Confidence of the predictive distribution model

Calculate the confidence in positive predictions within known presences (CPP, type = "positive") or confidence in predictions within known presences (CP, type = "neutral") based on the occurrence observations, the predictions of the probability of occurrence, and the two thresholds distinguishing certain negatives/positives from uncertain predictions.

Usage

confidence(
  observations,
  predictions,
  thresholds = confcons::thresholds(observations = observations, predictions =
    predictions),
  type = "positive"
)

Arguments

observations: Either an integer or logical vector containing the binary observations where presences are encoded as 1s/TRUEs and absences as 0s/FALSEs.
predictions: A numeric vector containing the predicted probabilities of occurrence typically within the [0, 1] interval. length(predictions) should be equal to length(observations) and the order of the elements should match.
thresholds: A numeric vector of length two, typically calculated by thresholds(). The first element distinguishes certain negatives (certain absences) from uncertain predictions. The second element distinguishes certain positives (certain presences) from uncertain predictions. If missing, confcons::thresholds(observations = observations, predictions = predictions) is called, but see section 'Note' about why you should not use the default value.
type: A character vector of length one containing the value "positive" (for calculating confidence in positive predictions within known presences (CPP)) or "neutral" (for calculating confidence in predictions within known presences (CP)). Defaults to "positive".

Value

A numeric vector of length one. It is either NA_real_ or a number within the [0, 1] interval. A larger value indicates that the model is more confident.

Note

Technically, confidence can be calculated for the training subset, the evaluation subset, or the whole dataset as well. Note, however, that it does not make much sense to calculate confidence on the training subset, except when the result is used for calculating consistency. If you need only the confidence measure, calculate it on the evaluation subset using thresholds previously determined on the whole dataset (i.e., do not use the default value of parameter thresholds). See the 'Real-life case' example below and the vignette.

Examples

set.seed(12345)

# Using logical observations, default 'thresholds' and 'type' parameter:
observations_1000_logical <- c(rep(x = FALSE, times = 500),
                               rep(x = TRUE, times = 500))
predictions_1000 <- c(runif(n = 500, min = 0, max = 0.7),
                      runif(n = 500, min = 0.3, max = 1))
confidence(observations = observations_1000_logical,
           predictions = predictions_1000) # 0.561
#> [1] 0.5607064

# Using integer observations, default 'thresholds' parameter,
# both 'positive' and 'neutral' confidence type:
observations_4000_integer <- c(rep(x = 0L, times = 3000),
                               rep(x = 1L, times = 1000))
predictions_4000 <- c(runif(n = 3000, min = 0, max = 0.8),
                      runif(n = 1000, min = 0.2, max = 0.9))
confidence(observations = observations_4000_integer,
           predictions = predictions_4000, type = "positive") # 0.691
#> [1] 0.6912378
confidence(observations = observations_4000_integer,
           predictions = predictions_4000, type = "neutral") # 0.778
#> [1] 0.778

# Using some previously selected thresholds:
strict_thresholds <- c(0.1, 0.9)
permissive_thresholds <- c(0.4, 0.5)
percentile_thresholds <- quantile(x = predictions_4000[observations_4000_integer == 1],
                                  probs = c(0.1, 0.9)) # 10th and 90th percentile
confidence(observations = observations_4000_integer,
           predictions = predictions_4000,
           thresholds = strict_thresholds,
           type = "neutral") # 0
#> [1] 0
confidence(observations = observations_4000_integer,
           predictions = predictions_4000,
           thresholds = permissive_thresholds,
           type = "neutral") # 0.836
#> [1] 0.836
confidence(observations = observations_4000_integer,
           predictions = predictions_4000,
           thresholds = percentile_thresholds,
           type = "neutral") # 0.2
#> [1] 0.2

# Real-life case
# (thresholds calculated from the whole dataset, confidence from the evaluation subset):
dataset <- data.frame(
  observations = observations_4000_integer,
  predictions = predictions_4000,
  evaluation_mask = c(rep(x = FALSE, times = 250),
                      rep(x = TRUE, times = 250),
                      rep(x = FALSE, times = 250),
                      rep(x = TRUE, times = 250))
)
thresholds_whole <- thresholds(observations = dataset$observations,
                               predictions = dataset$predictions)
(confidence_evaluation <- confidence(observations = dataset$observations[dataset$evaluation_mask],
                                     predictions = dataset$predictions[dataset$evaluation_mask],
                                     thresholds = thresholds_whole)) # 0.671
#> [1] 0.6713092

# Wrong parameterization:
try(confidence(observations = observations_1000_logical,
               predictions = predictions_1000,
               type = "pos")) # error
#> Error in confidence(observations = observations_1000_logical, predictions = predictions_1000,  : 
#>   Parameter 'type' must be 'positive' or 'neutral'.
try(confidence(observations = observations_1000_logical,
               predictions = predictions_1000,
               thresholds = c(0.2, NA_real_))) # warning
#> Warning: Parameter 'thresholds' is expected to contain numbers falling within the [0, 1] interval, but found to contain 0.200 and NA.
#> [1] NA
try(confidence(observations = observations_1000_logical,
               predictions = predictions_1000,
               thresholds = c(-0.4, 0.85))) # warning
#> Warning: Parameter 'thresholds' is expected to contain numbers falling within the [0, 1] interval, but found to contain -0.400 and 0.850.
#> [1] 0.186
try(confidence(observations = observations_1000_logical,
               predictions = predictions_1000,
               thresholds = c(0.6, 0.3))) # warning
#> Warning: Parameter 'thresholds' is expected to contain two numbers increasing strictly monotonously, i.e. thresholds[1] < thresholds[2], but found to contain 0.600 and 0.300, respectively.
#> [1] 1
try(confidence(observations = observations_1000_logical,
               predictions = predictions_4000)) # error
#> Error in confidence(observations = observations_1000_logical, predictions = predictions_4000) : 
#>   The length of parameters 'observations' and 'predictions' should be the same.
set.seed(12345)
observations_4000_numeric <- c(rep(x = 0, times = 3000),
                               rep(x = 1, times = 1000))
predictions_4000_strange <- c(runif(n = 3000, min = -0.3, max = 0.4),
                              runif(n = 1000, min = 0.6, max = 1.5))
try(confidence(observations = observations_4000_numeric,
               predictions = predictions_4000_strange,
               thresholds = c(0.2, 0.7))) # multiple warnings
#> Warning: I found that parameter 'observations' is not an integer or logical vector. Coercion is done.
#> Warning: Strange predicted values found. Parameter 'predictions' preferably contains numbers falling within the [0,1] interval.
#> [1] 0.883
mask_of_normal_predictions <- predictions_4000_strange >= 0 & predictions_4000_strange <= 1
confidence(observations = as.integer(observations_4000_numeric)[mask_of_normal_predictions],
           predictions = predictions_4000_strange[mask_of_normal_predictions],
           thresholds = c(0.2, 0.7)) # OK
#> [1] 0.7434211