0fe6aafe48a44cf8ec6f5a86ac3d212622074150,recordlinkage/index.py,SortedNeighbourhood,_link_index,#SortedNeighbourhood#Any#Any#,250

Before Change


        pairs_concat = [merge_lagged(data_left, data_right, w)
                        for w in range(-_window, _window + 1)]

        pairs = pandas.concat(pairs_concat, axis=0).set_index(
            ["index_x", "index_y"]
        ).index

        return pairs


class Random(BaseIndexAlgorithm):
    Random(n, replace=True, random_state=None)

After Change


        data_right = pandas.DataFrame(
            df_b[listify(right_on) + block_right_on], copy=False)
        data_right.columns = blocking_keys
        data_right["index_y"] = numpy.arange(len(df_b))
        data_right.dropna(axis=0, how="any",
                          subset=blocking_keys,
                          inplace=True)

        # sorting_key_values is the terminology used in Data Matching
        # [Christen, 2012]
        if self.sorting_key_values is None:

            self.sorting_key_values = self._get_sorting_key_values(
                data_left["sorting_key"].values,
                data_right["sorting_key"].values
            )

        sorting_key_factors = pandas.Series(
            numpy.arange(len(self.sorting_key_values)),
            index=self.sorting_key_values)

        data_left["sorting_key"] = data_left[
            "sorting_key"].map(sorting_key_factors)
        data_right["sorting_key"] = data_right[
            "sorting_key"].map(sorting_key_factors)

        # Internal window size
        _window = int((window - 1) / 2)

        def merge_lagged(x, y, w):
            """Merge two dataframes with a lag on the sorting key.

            The "sorting_key" column of ``y`` is shifted by ``w`` before the
            two frames are inner-merged, so a row of ``x`` is paired with the
            rows of ``y`` whose original sorting key equals the row's own
            sorting key minus ``w``.

            Parameters
            ----------
            x : pandas.DataFrame
                Left frame; must contain a "sorting_key" column.
            y : pandas.DataFrame
                Right frame; must contain a "sorting_key" column. It is not
                modified.
            w : int
                Lag added to the "sorting_key" values of ``y``.

            Returns
            -------
            pandas.DataFrame
                The inner merge of ``x`` with the shifted copy of ``y``.
            """
            # assign() returns a new frame, so the caller's ``y`` keeps its
            # original sorting keys for the next lag value.
            shifted = y.assign(sorting_key=y["sorting_key"] + w)

            # No explicit ``on``: pandas joins on the columns both frames
            # have in common.
            return x.merge(shifted, how="inner")

        pairs_concat = [merge_lagged(data_left, data_right, w)
                        for w in range(-_window, _window + 1)]

        pairs_df = pandas.concat(pairs_concat, axis=0)

        return pandas.MultiIndex(
            levels=[df_a.index.values, df_b.index.values],
            labels=[pairs_df["index_x"].values, pairs_df["index_y"].values],
            verify_integrity=False
        )
Italian Trulli
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 9

Instances


Project Name: J535D165/recordlinkage
Commit Name: 0fe6aafe48a44cf8ec6f5a86ac3d212622074150
Time: 2018-03-10
Author: jonathandebruinhome@gmail.com
File Name: recordlinkage/index.py
Class Name: SortedNeighbourhood
Method Name: _link_index


Project Name: J535D165/recordlinkage
Commit Name: 4a24e4e36ee175aef54b92eb03e08a2be7811a96
Time: 2018-03-10
Author: jonathandebruinhome@gmail.com
File Name: recordlinkage/datasets/febrl.py
Class Name:
Method Name: _febrl_links


Project Name: J535D165/recordlinkage
Commit Name: 0fe6aafe48a44cf8ec6f5a86ac3d212622074150
Time: 2018-03-10
Author: jonathandebruinhome@gmail.com
File Name: recordlinkage/index.py
Class Name: Block
Method Name: _link_index


Project Name: J535D165/recordlinkage
Commit Name: 0fe6aafe48a44cf8ec6f5a86ac3d212622074150
Time: 2018-03-10
Author: jonathandebruinhome@gmail.com
File Name: recordlinkage/index.py
Class Name: SortedNeighbourhood
Method Name: _link_index