6b9c5e536d9de4aa959caff86a27472e8e75d9f7,finetune/input_pipeline.py,BasePipeline,_context_to_vector,#BasePipeline#Any#,167

Before Change


                        padded_indices.append(idx)
                    else:
                        tokens_with_context.append(sample[idx])
                characteristics = pd.DataFrame(tokens_with_context).to_dict("list")

                # make sure all features cover the same number of tokens, and calculate total num tokens
                num_tokens = None
                for label in self.context_labels:
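
For reference, a minimal standalone sketch (with made-up token dicts, not project data) of what the `pd.DataFrame(tokens_with_context).to_dict("list")` call above produces: pandas builds one column per key across all dicts and returns a mapping from each key to the list of per-token values, filling NaN where a token lacks a key.

import pandas as pd

# Hypothetical per-token context dicts (illustration only).
tokens_with_context = [
    {"left": 10, "bold": True},
    {"left": 14, "bold": False},
    {"left": 18},  # "bold" missing for this token
]

characteristics = pd.DataFrame(tokens_with_context).to_dict("list")
print(characteristics)
# {'left': [10, 14, 18], 'bold': [True, False, nan]}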

After Change


                    else:
                        tokens_with_context.append(sample[idx])
                # characteristics = pd.DataFrame(tokens_with_context).to_dict("list")
                characteristics = {k:[dictionary[k] for dictionary in tokens_with_context] for k in tokens_with_context[0].keys()}

                # make sure all features cover the same number of tokens, and calculate total num tokens
                num_tokens = None
                for label in self.context_labels:
                    new_length = len(characteristics[label]) if type(characteristics[label] == list) else 1
                    if num_tokens is not None and num_tokens != new_length:
                        raise FinetuneError("Incorrect label shapes.")
                    num_tokens = new_length

                vector = np.zeros((num_tokens + len(padded_indices), self.config.context_dim), dtype=np.float32)  # Feature vector for one document; len(padded_indices) accounts for the special tokens at beginning/end (typically 2)
                current_index = 0

                # Loop through each feature and add each to a new index of the feature vector
                for label in self.context_labels:
                    if label == self.config.pad_token:
                        continue
                    data = characteristics[label]

                    # Binary encoded features have different dimensionality than simple floats/ints, so they must be handled differently.
                    if label in self.label_encoders.keys():
                        without_nans = [x for x in data if not pd.isna(x)]
                        data = self.label_encoders[label].transform(without_nans)  # this removes NaNs from padded tokens, but now the list is too short; the "num_backward" variable tracks this offset so the remaining indices stay aligned.
                        data_dim = len(self.label_encoders[label].classes_)
                        if data_dim == 2:  # since binary classes default to column vector
                            data_dim=1
                    else:
                        data = [x for x in data if not pd.isna(x)]
                        data_dim = 1
                    # print(label)
                    # print(self.label_encoders.keys())
                    # loop through indices and fill with correct data
                    num_backward = 0
                    for sample_idx in range(num_tokens):
                        if sample_idx in padded_indices:  # there is no data, simply a pad from the encoder, so fill out with zero vector
                            vector[sample_idx][:] = 0
                            num_backward += 1  # since we're skipping this sample, the next sample needs to be filled from this sample's index in data. This variable tracks how far back we need to go for following indices
                            continue
                        for label_dimension in range(data_dim):
                            //print("Label dim:" + str(label_dimension + current_index) + "Sample Index:" + str(sample_idx) + "Current label" + label)
                            vector[sample_idx][current_index + label_dimension] = data[sample_idx - num_backward] if data_dim  == 1 else data[sample_idx - num_backward][label_dimension]

                    current_index += 1
                vector_list.append(vector)
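
The key change is replacing the per-document DataFrame round trip with a plain dict comprehension. Below is a minimal standalone sketch (again with made-up data, not project code) showing the replacement producing the same key-to-list mapping; note that, unlike the DataFrame version, the comprehension only uses the keys of the first token dict and raises a KeyError if a later dict is missing one of them, rather than filling NaN.

# Hypothetical per-token context dicts (illustration only), mirroring the After Change comprehension.
tokens_with_context = [
    {"left": 10, "bold": True},
    {"left": 14, "bold": False},
]

characteristics = {
    k: [dictionary[k] for dictionary in tokens_with_context]
    for k in tokens_with_context[0].keys()
}
print(characteristics)
# {'left': [10, 14], 'bold': [True, False]}

The likely motivation is avoiding the cost of constructing a pandas DataFrame for every document just to regroup a list of dicts by key.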
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 5

Instances


Project Name: IndicoDataSolutions/finetune
Commit Name: 6b9c5e536d9de4aa959caff86a27472e8e75d9f7
Time: 2019-08-02
Author: matthew.bayer@indico.io
File Name: finetune/input_pipeline.py
Class Name: BasePipeline
Method Name: _context_to_vector


Project Name: NifTK/NiftyNet
Commit Name: 223bce2eb5903bf58a12250f3ea5585c33a7778e
Time: 2019-06-05
Author: carole.sudre@kcl.ac.uk
File Name: niftynet/contrib/csv_reader/csv_reader.py
Class Name: CSVReader
Method Name: initialise


Project Name: instacart/lore
Commit Name: 10332a44b9a5c2d637f614df392fe9c796dfdf0a
Time: 2018-06-01
Author: montana@instacart.com
File Name: lore/encoders.py
Class Name: Unique
Method Name: fit