python#特征工程和文本特征提取

字典数据特征抽取

DictVectorizer(语法)
DictVectorizer.fit_transform(X)
-->X：字典或者包含字典的迭代器（放在列表里）
-->返回值：返回sparse矩阵

from sklearn.feature_extraction import DictVectorizer


def dictvec():
    """Demonstrate feature extraction from a list of dictionaries.

    Fits a ``DictVectorizer`` on three city/temperature records and prints
    the transformed data (a sparse matrix with the default settings).

    Returns:
        None. The result is only printed.
    """
    # Named ``vectorizer`` rather than ``dict`` so the builtin ``dict`` type
    # is not shadowed inside this function.
    vectorizer = DictVectorizer()

    # fit_transform learns the feature mapping and encodes the records.
    data = vectorizer.fit_transform([
        {"city": "北京", 'temperature': 100},
        {"city": "上海", 'temperature': 60},
        {"city": "深圳", 'temperature': 30},
    ])
    print(data)

# Run the dictionary feature-extraction demo only when executed as a script.
if __name__ == "__main__":
    dictvec()

---------上面那个的输出
 	(0, 1)	1.0
  (0, 3)	100.0
  (1, 0)	1.0
  (1, 3)	60.0
  (2, 2)	1.0
  (2, 3)	30.0
  ------------
  节约内存，方便读取处理

dict = DictVectorizer(sparse=False) 把他变好看点
----------------
[[  0.   1.   0. 100.]
 [  1.   0.   0.  60.]
 [  0.   0.   1.  30.]]

 第0行，第1个位置是1，第3个位置是100（行和位置都从0开始计数）
 ----------------
  print(dict.get_feature_names())

总结：字典数据抽取会把字典中属于类别的数据分别转换成特征，数值类型的数据保持不变；结果为数组形式，其中有类别的部分如下：

[[ 0. 1. 0]
[ 1. 0. 0]
[ 0. 0. 1]]
one-hot编码

文本特征抽取

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer


def countvec():
    """Demonstrate bag-of-words feature extraction on English text.

    Fits a ``CountVectorizer`` on two short documents, then prints the
    learned vocabulary followed by the per-document count matrix.

    Returns:
        None. The results are only printed.
    """
    cv = CountVectorizer()
    data = cv.fit_transform(['life is gone，is dog', 'your are pig'])

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement and fall back only on releases that predate it.
    try:
        vocabulary = cv.get_feature_names_out().tolist()
    except AttributeError:
        vocabulary = cv.get_feature_names()

    # 1. The vocabulary holds every distinct token across all documents,
    #    with repeated tokens listed once.
    # 2. Each document is then counted against that vocabulary.
    print(vocabulary)

    # Single-letter tokens are not counted.
    print(data.toarray())
    return None


# Run the English text feature-extraction demo only when executed as a script.
if __name__ == "__main__":
    countvec()
-----------------

['are', 'dog', 'gone', 'is', 'life', 'pig', 'your']
[[0 1 1 2 1 0 0]
[1 0 0 0 0 1 1]]

中文文本抽取

1.得先对中文分词,使用jieba
2.使用import jieba
3.jieba.cut("")
4.返回一个词语生成器

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import jieba


def cutword():
    """Segment three sample Chinese sentences with jieba.

    Prints the token list and the space-joined form of the first sentence
    as a debugging aid.

    Returns:
        A 3-tuple of strings, one per sentence, in which the jieba tokens
        are separated by single spaces.
    """
    sentences = (
        "你是个大傻子吗？为什么不给我分词？",
        '你不会是个大傻子吧,我知道你是个大傻子',
        '今天很残酷，明天很美好？',
    )

    # jieba.cut yields tokens lazily; materialise each sentence as a list.
    token_lists = [list(jieba.cut(sentence)) for sentence in sentences]
    print(f"con1:{token_lists[0]}")

    # Put one space between consecutive tokens of each sentence.
    joined = tuple(' '.join(tokens) for tokens in token_lists)
    print(f"c1:{joined[0]}")
    return joined


def hanzivec():
    """Demonstrate bag-of-words feature extraction on Chinese text.

    Takes the three space-separated sentences returned by ``cutword()``,
    fits a ``CountVectorizer`` on them, and prints the sentences, the
    learned vocabulary and the count matrix.
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)

    cv = CountVectorizer()
    data = cv.fit_transform([c1, c2, c3])

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement and fall back only on releases that predate it.
    try:
        vocabulary = cv.get_feature_names_out().tolist()
    except AttributeError:
        vocabulary = cv.get_feature_names()
    print(vocabulary)

    print(data.toarray())



# Run the Chinese text feature-extraction demo only when executed as a script.
if __name__ == "__main__":
    hanzivec()

tf—idf

Tf:term frequency:词的频率–出现的次数
Idf：inverse document frequency----log(总文档数量/该词出现的文档数)
tf–idf–>重要性
tf-IDF：用以评估一个字词对于一个文件集或一个语料库中其中一份文件的重要程度

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import jieba


def cutword():
    """Segment three sample Chinese sentences with jieba.

    Returns:
        A 3-tuple of strings, one per sentence, in which the jieba tokens
        are separated by single spaces.
    """
    samples = (
        "你是个大傻子吗？为什么不给我分词？",
        '今天很残酷，明天很美好？',
        '我是一个小小小小鸟，我飞，我飞，飞不高，啦啦',
    )
    # Put one space between consecutive tokens of each sentence.
    return tuple(' '.join(jieba.cut(text)) for text in samples)


def hanzivec():
    """Demonstrate bag-of-words feature extraction on Chinese text.

    Takes the three space-separated sentences returned by ``cutword()``,
    fits a ``CountVectorizer`` on them, and prints the sentences, the
    learned vocabulary and the count matrix.
    """
    c1, c2, c3 = cutword()
    print(c1, c2, c3)

    cv = CountVectorizer()
    data = cv.fit_transform([c1, c2, c3])

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement and fall back only on releases that predate it.
    try:
        vocabulary = cv.get_feature_names_out().tolist()
    except AttributeError:
        vocabulary = cv.get_feature_names()
    print(vocabulary)

    print(data.toarray())

def tfidfvec():
    """Demonstrate TF-IDF feature extraction on Chinese text.

    Takes the three space-separated sentences returned by ``cutword()``,
    fits a ``TfidfVectorizer`` on them, and prints the learned vocabulary
    and the resulting weight matrix.
    """
    c1, c2, c3 = cutword()

    tf = TfidfVectorizer()
    data = tf.fit_transform([c1, c2, c3])

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement and fall back only on releases that predate it.
    try:
        vocabulary = tf.get_feature_names_out().tolist()
    except AttributeError:
        vocabulary = tf.get_feature_names()
    print(vocabulary)

    print(data.toarray())


# Run the TF-IDF demo only when executed as a script.
if __name__ == "__main__":
    tfidfvec()

为什么需要TfidfVectorizer–它是分类机器学习算法的重要依据

Rcefcn

发布了39 篇原创文章 · 获赞 1 · 访问量 391

私信关注

python#特征工程和文本特征提取

字典数据特征抽取

文本特征抽取

中文文本抽取

tf—idf

猜你喜欢