gensim
Quick Start
raw_corpus = ["Human machine interface for lab abc computer applications",
"A survey of user opinion of computer system response time",
"The EPS user interface management system",
"System and human system engineering testing of EPS",
"Relation of user perceived response time to error measurement",
"The generation of random binary unordered trees",
"The intersection graph of paths in trees",
"Graph minors IV Widths of trees and well quasi ordering",
"Graph minors A survey"]
# set of common stop words
stoplist = set('for a of the and to in'.split(' '))
# lowercase each document, split it on whitespace, and drop stop words
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in raw_corpus]
# count how often each token occurs across the corpus
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
# keep only tokens that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
processed_corpus
OUT
[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]
from gensim import corpora
# build a Dictionary mapping each token to a unique integer id
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)
print(dictionary.token2id)
OUT
Dictionary(12 unique tokens: [u'minors', u'graph', u'system', u'trees', u'eps']...)
{u'minors': 11, u'graph': 10, u'system': 6, u'trees': 9, u'eps': 8, u'computer': 1, u'survey': 5, u'user': 7, u'human': 2, u'time': 4, u'interface': 0, u'response': 3}
# convert each document to a bag-of-words vector of (token_id, count) pairs
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
bow_corpus
OUT
[[(0, 1), (1, 1), (2, 1)], [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(0, 1), (6, 1), (7, 1), (8, 1)], [(2, 1), (6, 2), (8, 1)], [(3, 1), (4, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(5, 1), (10, 1), (11, 1)]]
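The same dictionary can vectorize documents it was not built from; as a small illustration (the phrase below is made up for this example), tokens the dictionary has never seen are simply dropped from the resulting vector:

new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
# 'interaction' is not in dictionary.token2id, so only
# 'computer' (id 1) and 'human' (id 2) survive: [(1, 1), (2, 1)]
new_vec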
from gensim import models
# train the model
tfidf = models.TfidfModel(bow_corpus)
# transform the "system minors" string
tfidf[dictionary.doc2bow("system minors".lower().split())]
OUT
[(6, 0.5898341626740045), (11, 0.8075244024440723)]
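The TF-IDF vectors are typically indexed for similarity queries; a minimal sketch using gensim's similarities module, reusing the "system minors" query from above:

from gensim import similarities
# build a sparse index over the TF-IDF-transformed corpus
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))
# cosine similarity of the query against every document in the corpus
query_bow = dictionary.doc2bow("system minors".lower().split())
print(list(enumerate(index[tfidf[query_bow]])))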
word2vec
from gensim.models.word2vec import Word2Vec
sentences = [['first', 'sentence'], ['second', 'sentence']]
model = Word2Vec(sentences, size=5, min_count=1)
print(len(model.wv.vocab))
print(model.wv.vocab)
print(model)
print(model.wv['first'])
OUT
3
{'second': <gensim.models.keyedvectors.Vocab object at 0x7f0cd960e8d0>, 'first': <gensim.models.keyedvectors.Vocab object at 0x7f0cdb67b3d0>, 'sentence': <gensim.models.keyedvectors.Vocab object at 0x7f0cdb67b850>}
Word2Vec(vocab=3, size=5, alpha=0.025)
[-0.0295424 -0.06962696 0.0141016 0.04448513 0.01840453]
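The trained vectors are queried through model.wv; a quick sketch (with only two toy sentences the neighbours are essentially random):

# nearest neighbours of 'sentence' by cosine similarity
print(model.wv.most_similar('sentence', topn=2))
# cosine similarity between two specific words
print(model.wv.similarity('first', 'second'))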
- Reading a pre-tokenized text file (.txt)
class MyText(object):
    def __iter__(self):
        # stream one whitespace-tokenized sentence per line,
        # without loading the whole file into memory
        for line in open('/fPath/fname.txt'):
            yield line.split()

sentences = MyText()
model = Word2Vec(sentences, size=100, min_count=5)
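If the file already contains one whitespace-tokenized sentence per line, the custom iterator above can be replaced by gensim's built-in LineSentence; a sketch reusing the same placeholder path:

from gensim.models.word2vec import LineSentence
# streams the file line by line, splitting each line on whitespace
sentences = LineSentence('/fPath/fname.txt')
model = Word2Vec(sentences, size=100, min_count=5)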