27bbb7015dc6bbe02e00bb1853e7952ac13e7fe0,gensim/models/word2vec.py,BrownCorpus,__iter__,#BrownCorpus#,1352

Before Change


            fname = os.path.join(self.dirname, fname)
            if not os.path.isfile(fname):
                continue
            for line in utils.smart_open(fname):
                line = utils.to_unicode(line)
                // each file line is a single sentence in the Brown corpus
                // each token is WORD/POS_TAG
                token_tags = [t.split("/") for t in line.split() if len(t.split("/")) == 2]
                // ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
                words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
                if not words:  // don't bother sending out empty sentences
                    continue
                yield words


class Text8Corpus(object):
    Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip.
    def __init__(self, fname, max_sentence_length=MAX_WORDS_IN_BATCH):
        self.fname = fname

After Change


            fname = os.path.join(self.dirname, fname)
            if not os.path.isfile(fname):
                continue
            with utils.open(fname, "rb") as fin:
                for line in fin:
                    line = utils.to_unicode(line)
                    // each file line is a single sentence in the Brown corpus
                    // each token is WORD/POS_TAG
                    token_tags = [t.split("/") for t in line.split() if len(t.split("/")) == 2]
                    // ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff)
                    words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()]
                    if not words:  // don't bother sending out empty sentences
                        continue
                    yield words


class Text8Corpus(object):
    Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip.
    def __init__(self, fname, max_sentence_length=MAX_WORDS_IN_BATCH):
        self.fname = fname
In pattern: SUPERPATTERN

Frequency: 3

Non-data size: 8

Instances


Project Name: RaRe-Technologies/gensim
Commit Name: 27bbb7015dc6bbe02e00bb1853e7952ac13e7fe0
Time: 2019-07-07
Author: itay.bittan@gmail.com
File Name: gensim/models/word2vec.py
Class Name: BrownCorpus
Method Name: __iter__


Project Name: RaRe-Technologies/gensim
Commit Name: 27bbb7015dc6bbe02e00bb1853e7952ac13e7fe0
Time: 2019-07-07
Author: itay.bittan@gmail.com
File Name: gensim/models/deprecated/word2vec.py
Class Name: BrownCorpus
Method Name: __iter__


Project Name: RaRe-Technologies/gensim
Commit Name: 27bbb7015dc6bbe02e00bb1853e7952ac13e7fe0
Time: 2019-07-07
Author: itay.bittan@gmail.com
File Name: gensim/models/deprecated/keyedvectors.py
Class Name: EuclideanKeyedVectors
Method Name: accuracy