Python 片段:生成语料中的词及其对应的 Id

'''
sent = [['I','am', 'a', 'student', '.'],['who', 'are', 'you','?'],['my', 'name', 'is', 'student']]
生成词汇集合
'''
def build_vocab(sentences):
    """Build index<->word mappings for every token in a tokenized corpus.

    Args:
        sentences: Iterable of sentences, each itself an iterable of
            hashable tokens, e.g. ``[['I', 'am'], ['who', 'are', 'you']]``.

    Returns:
        A ``(vocabulary_inv, vocabulary)`` tuple. ``vocabulary_inv`` is the
        list of distinct tokens ordered by descending frequency, and
        ``vocabulary`` maps each token to its position in that list.
    """
    # Flatten lazily instead of star-unpacking the whole corpus into the
    # argument list of chain().
    word_counts = Counter(itertools.chain.from_iterable(sentences))
    # most_common() without an argument returns every (token, count) pair,
    # most frequent first. Tokens with equal counts keep the order in which
    # they were first encountered (Python 3.7+), not alphabetical order.
    vocabulary_inv = [word for word, _ in word_counts.most_common()]
    # Invert the list: token -> integer id.
    vocabulary = {word: index for index, word in enumerate(vocabulary_inv)}
    return vocabulary_inv, vocabulary

# Maps sentences and labels to vectors based on a vocabulary
def build_input_data(sentences, labels, vocabulary):
    """Convert tokenized sentences and their labels into NumPy arrays.

    Args:
        sentences: Iterable of sentences, each an iterable of tokens that
            are keys of ``vocabulary``.
        labels: Labels to convert with ``np.array``.
        vocabulary: Mapping from token to integer id.

    Returns:
        A ``(x, y)`` tuple. When all sentences have the same length, ``x``
        is a 2-D array of token ids. When lengths differ, ``x`` is a 1-D
        object array whose elements are the per-sentence id lists. ``y`` is
        ``np.array(labels)``.

    Raises:
        KeyError: If a token is not present in ``vocabulary``.
    """
    id_rows = [[vocabulary[word] for word in sentence] for sentence in sentences]
    if len({len(row) for row in id_rows}) <= 1:
        # Rectangular (or empty) input converts directly.
        x = np.array(id_rows)
    else:
        # Recent NumPy releases refuse to build an array from ragged nested
        # lists implicitly, so construct the object array explicitly and
        # fill it element by element.
        x = np.empty(len(id_rows), dtype=object)
        for position, row in enumerate(id_rows):
            x[position] = row
    y = np.array(labels)
    return x, y

版权声明:本文为u014221266原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接和本声明。