from sklearn.feature_extraction.text import CountVectorizer
content = ["life is short,i like python","life is too long,i dislike python"]
vectorizer = CountVectorizer()
print(vectorizer.fit_transform(content).toarray())
需要toarray()方法转变为numpy的数组形式
2TF-IDF表示词的重要性
TfidfVectorizer会根据指定的公式将文档中的词转换为概率表示
方法
fit_transform(raw_documents,y) 学习词汇和idf,返回术语文档矩阵。
from sklearn.feature_extraction.text import TfidfVectorizer
content = ["life is short,i like python","life is too long,i dislike python"]
vectorizer = TfidfVectorizer(stop_words='english')
print(vectorizer.fit_transform(content).toarray())
print(vectorizer.vocabulary_)