    else:
        tokens_with_context.append(sample[idx])
# Transpose the list of per-token dicts into a dict of per-feature lists
# (equivalent to pd.DataFrame(tokens_with_context).to_dict("list")).
characteristics = {k: [dictionary[k] for dictionary in tokens_with_context] for k in tokens_with_context[0].keys()}
# Make sure all features cover the same number of tokens, and record that count.
num_tokens = None
for label in self.context_labels:
    new_length = len(characteristics[label]) if isinstance(characteristics[label], list) else 1
    if num_tokens is not None and num_tokens != new_length:
        raise FinetuneError("Incorrect label shapes.")
    num_tokens = new_length
# Feature vector for one document; the extra rows account for the padded special
# tokens at the beginning/end of the sequence.
vector = np.zeros((num_tokens + len(padded_indices), self.config.context_dim), dtype=np.float32)
current_index = 0
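# Illustrative layout (hypothetical example, not taken from this codebase): with
# two context features, where "font" was binarized over three classes and "size"
# is a plain scalar, self.config.context_dim would be 4 and each row of `vector`
# would look like:
#
#     [font=="bold", font=="italic", font=="plain", size]
#
# Rows at padded positions stay all zero.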
# Loop through each feature and write it into its own slice of the feature vector.
for label in self.context_labels:
    if label == self.config.pad_token:
        continue
    data = characteristics[label]
    # Binary-encoded features have a different dimensionality than plain
    # floats/ints, so they must be handled differently.
    if label in self.label_encoders.keys():
        without_nans = [x for x in data if not pd.isna(x)]
        # Encoding drops the NaNs that came from padded tokens, so `data` is now
        # shorter than the token sequence. The "num_backward" variable below
        # tracks that offset so indices stay aligned.
        data = self.label_encoders[label].transform(without_nans)
        data_dim = len(self.label_encoders[label].classes_)
        if data_dim == 2:
            # Two classes are encoded as a single column vector.
            data_dim = 1
    else:
        data = [x for x in data if not pd.isna(x)]
        data_dim = 1
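    # Illustrative sketch of the dimensionality handling above, assuming the
    # encoders behave like sklearn.preprocessing.LabelBinarizer (consistent with
    # the .transform()/.classes_ calls, though this snippet does not pin down the
    # encoder type):
    #
    #     >>> from sklearn.preprocessing import LabelBinarizer
    #     >>> LabelBinarizer().fit(["bold", "italic", "plain"]).transform(["plain"])
    #     array([[0, 0, 1]])   # three classes -> three columns
    #     >>> LabelBinarizer().fit(["no", "yes"]).transform(["yes"])
    #     array([[1]])         # two classes -> one column, hence data_dim = 1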
    # Loop through token indices and fill in this feature's column(s).
    num_backward = 0
    for sample_idx in range(num_tokens):
        if sample_idx in padded_indices:
            # There is no data here, just a pad from the encoder, so leave the
            # row as a zero vector.
            vector[sample_idx][:] = 0
            # Since we're skipping this sample, the next sample needs to be filled
            # from this sample's index in `data`; num_backward tracks how far back
            # to reach for the following indices.
            num_backward += 1
            continue
        for label_dimension in range(data_dim):
            vector[sample_idx][current_index + label_dimension] = (
                data[sample_idx - num_backward]
                if data_dim == 1
                else data[sample_idx - num_backward][label_dimension]
            )
    current_index += data_dim  # advance past the columns this feature occupies
vector_list.append(vector)
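# `vector` is now a (num_tokens + len(padded_indices), context_dim) float32 array
# for this document; `vector_list` accumulates one such array per document.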