dad1abdcffb4d37256502a73a1c236aa2f07636b,pycorrector/bert/bert_corrector.py,BertCorrector,bert_correct,#BertCorrector#Any#,33
Before Change
:return: list[list], [error_word, begin_pos, end_pos, error_type]
maybe_errors = []
for idx, s in enumerate(sentence):
// 对非中文的错误不做处理
if not is_chinese_string(s):
continue
sentence_lst = list(sentence)
sentence_lst[idx] = self.mask
sentence_new = "".join(sentence_lst)
predicts = self.model(sentence_new)
top_tokens = []
for p in predicts:
token_id = p.get("token", 0)
token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
top_tokens.append(token_str)
if top_tokens and (s not in top_tokens):
// 取得所有可能正确的词
candidates = self.generate_items(s)
if not candidates:
continue
for token_str in top_tokens:
if token_str in candidates:
maybe_errors.append([s, token_str, idx, idx + 1])
break
return maybe_errors
if __name__ == "__main__":
After Change
// 编码统一,utf-8 to unicode
text = convert_to_unicode(text)
// 长句切分为短句
blocks = self.split_2_short_text(text, include_symbol=True)
for blk, start_idx in blocks:
blk_new = ""
for idx, s in enumerate(blk):
// 对非中文的错误不做处理
In pattern: SUPERPATTERN
Frequency: 3
Non-data size: 5
Instances Project Name: shibing624/pycorrector
Commit Name: dad1abdcffb4d37256502a73a1c236aa2f07636b
Time: 2020-03-17
Author: xuming624@qq.com
File Name: pycorrector/bert/bert_corrector.py
Class Name: BertCorrector
Method Name: bert_correct
Project Name: shibing624/pycorrector
Commit Name: 8f76475c3e5b3533b40e7ba41ca11ec987ffe974
Time: 2020-03-15
Author: xuming624@qq.com
File Name: pycorrector/corrector.py
Class Name: Corrector
Method Name: correct
Project Name: shibing624/pycorrector
Commit Name: 8f76475c3e5b3533b40e7ba41ca11ec987ffe974
Time: 2020-03-15
Author: xuming624@qq.com
File Name: pycorrector/detector.py
Class Name: Detector
Method Name: detect