// Add uncertainty scores to our unlabeled data, and keep a copy of our unlabeled data.
unlabeled_uncertainty = np.concatenate((unlabeled, np.expand_dims(uncertainty_scores, axis=1)), axis=1)
unlabeled_uncertainty_copy = np.copy(unlabeled_uncertainty)
// Define our record container and the maximum number of records to sample.
instance_index_ranking = []
ceiling = np.minimum(unlabeled.shape[0], n_instances)
// TODO (dataframing) is there a better way to do this? Inherently sequential.
for _ in range(ceiling):
// Select the instance from our unlabeled copy that scores highest.
raw_instance = select_instance(X_training=labeled, X_uncertainty=unlabeled_uncertainty_copy)
instance = np.expand_dims(raw_instance, axis=1)
// Find our record"s index in both the original unlabeled and our uncertainty copy.
instance_index_original = np.where(np.all(unlabeled == raw_instance, axis=1))[0][0]
instance_index_copy = np.where(np.all(unlabeled_uncertainty_copy[:, :-1] == instance.T, axis=1))[0][0]
// Add our instance we"ve considered for labeling to our labeled set. Although we don"t
// know it"s label, we want further iterations to consider the newly-added instance so
// that we don"t query the same instance redundantly.
labeled = np.concatenate((labeled, instance.T), axis=0)
// Remove our instance from the unlabeled set and append it to our list of records to label.
unlabeled_uncertainty_copy = np.delete(unlabeled_uncertainty_copy, instance_index_copy, axis=0)
instance_index_ranking.append(instance_index_original)
// Return numpy array, not a list.
return np.array(instance_index_ranking)
After Change
# Attach the uncertainty scores to the unlabeled records as extra trailing column(s).
unlabeled_uncertainty = np.concatenate((unlabeled, expanded_uncertainty_scores), axis=1)

# All-NaN placeholder with one entry per column of unlabeled_uncertainty. Rows overwritten
# with it mark records that were already picked.
# NOTE(review): select_instance is expected to filter such rows out; it is not visible
# here -- confirm.
null_row = np.full(unlabeled_uncertainty.shape[1], np.nan)

# Ranking accumulator, plus the cap on how many records we may sample: never more than
# the number of unlabeled records.
instance_index_ranking = []
ceiling = np.minimum(unlabeled.shape[0], n_instances)

for _ in range(ceiling):
    # select_instance hands back both the row index and the record itself.
    instance_index, instance = select_instance(
        X_training=labeled,
        X_uncertainty=unlabeled_uncertainty,
    )

    # Give the record a leading axis so it can be stacked under `labeled`.
    expanded_instance = np.expand_dims(instance, axis=0)

    # Treat the chosen record as if it were already labeled. Its label is still unknown,
    # but later iterations should take it into account so the same record is not
    # queried twice.
    labeled = np.concatenate((labeled, expanded_instance), axis=0)

    # "Remove" the record from the unlabeled pool by blanking its row with NaN rather
    # than deleting it, so row indices stay aligned with the original array.
    unlabeled_uncertainty[instance_index] = null_row

    # Record the chosen index at the bottom of the ranking.
    instance_index_ranking.append(instance_index)