0fe6aafe48a44cf8ec6f5a86ac3d212622074150,recordlinkage/index.py,SortedNeighbourhood,_link_index,#SortedNeighbourhood#Any#Any#,250
Before Change
pairs_concat = [merge_lagged(data_left, data_right, w)
for w in range(-_window, _window + 1)]
pairs = pandas.concat(pairs_concat, axis=0).set_index(
["index_x", "index_y"]
).index
return pairs
class Random(BaseIndexAlgorithm):
Random(n, replace=True, random_state=None)
After Change
data_right = pandas.DataFrame(
df_b[listify(right_on) + block_right_on], copy=False)
data_right.columns = blocking_keys
data_right["index_y"] = numpy.arange(len(df_b))
data_right.dropna(axis=0, how="any",
subset=blocking_keys,
inplace=True)
// sorting_key_values is the terminology in Data Matching [Christen,
// 2012]
if self.sorting_key_values is None:
self.sorting_key_values = self._get_sorting_key_values(
data_left["sorting_key"].values,
data_right["sorting_key"].values
)
sorting_key_factors = pandas.Series(
numpy.arange(len(self.sorting_key_values)),
index=self.sorting_key_values)
data_left["sorting_key"] = data_left[
"sorting_key"].map(sorting_key_factors)
data_right["sorting_key"] = data_right[
"sorting_key"].map(sorting_key_factors)
// Internal window size
_window = int((window - 1) / 2)
def merge_lagged(x, y, w):
    """Merge two dataframes with a lag of ``w`` on the sorting key.

    Parameters
    ----------
    x : pandas.DataFrame
        Left frame; must contain a ``"sorting_key"`` column.
    y : pandas.DataFrame
        Right frame; must contain a ``"sorting_key"`` column. It is not
        modified: the shift is applied to a copy.
    w : int
        Offset added to ``y["sorting_key"]`` before merging.

    Returns
    -------
    pandas.DataFrame
        Inner merge of ``x`` with the shifted copy of ``y``. No ``on``
        is given, so pandas joins on every column name the two frames
        share (presumably the blocking/sorting keys -- confirm against
        the caller).
    """
    # ``assign`` returns a new frame, so the caller's ``y`` keeps its
    # original sorting keys for the next window offset.
    shifted = y.assign(sorting_key=y["sorting_key"] + w)
    return x.merge(shifted, how="inner")
pairs_concat = [merge_lagged(data_left, data_right, w)
for w in range(-_window, _window + 1)]
pairs_df = pandas.concat(pairs_concat, axis=0)
return pandas.MultiIndex(
levels=[df_a.index.values, df_b.index.values],
labels=[pairs_df["index_x"].values, pairs_df["index_y"].values],
verify_integrity=False
)
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 9
Instances
Project Name: J535D165/recordlinkage
Commit Name: 0fe6aafe48a44cf8ec6f5a86ac3d212622074150
Time: 2018-03-10
Author: jonathandebruinhome@gmail.com
File Name: recordlinkage/index.py
Class Name: SortedNeighbourhood
Method Name: _link_index
Project Name: J535D165/recordlinkage
Commit Name: 4a24e4e36ee175aef54b92eb03e08a2be7811a96
Time: 2018-03-10
Author: jonathandebruinhome@gmail.com
File Name: recordlinkage/datasets/febrl.py
Class Name:
Method Name: _febrl_links
Project Name: J535D165/recordlinkage
Commit Name: 0fe6aafe48a44cf8ec6f5a86ac3d212622074150
Time: 2018-03-10
Author: jonathandebruinhome@gmail.com
File Name: recordlinkage/index.py
Class Name: Block
Method Name: _link_index
Project Name: J535D165/recordlinkage
Commit Name: 0fe6aafe48a44cf8ec6f5a86ac3d212622074150
Time: 2018-03-10
Author: jonathandebruinhome@gmail.com
File Name: recordlinkage/index.py
Class Name: SortedNeighbourhood
Method Name: _link_index