CountVectorizer参数学习

在这里插入图片描述

from sklearn.feature_extraction.text import CountVectorizer
corpus = ['a fine cut', '他 喜欢 动物']
vectorizer = CountVectorizer(binary=False, analyzer='word', vocabulary=['a', '他', '喜欢'], stop_words=['a'])
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(X.toarray())

在这里插入图片描述

指定vocabulary，此时tokenizer/token_pattern/stop_words/max_df等都无效，即和分词有关的参数都无效。可以看到最终生成的词典只有我们参数中指定的a/他/喜欢

为什么会这样呢？我们去研究一下源代码(sklearn/feature_extraction/text.py)：

其中CountVectorizer中的fit_transform的源代码如下所示，需要重点关注的是 self._validate_vocabulary()和self.count_vocab(raw_documents,self.fixed_vocabulary)

def fit_transform(self, raw_documents, y=None):
    if isinstance(raw_documents, six.string_types):
        raise ValueError(
            "Iterable over raw text documents expected, "
            "string object received.")

    self._validate_params()       # 确保n-gram的两个输入值的大小关系正确
    self._validate_vocabulary()  # 如果输入参数包含vocabulary，则使self.fixed_vocabulary_ = True和self.vocabulary_ = dict(vocabulary)
    max_df = self.max_df
    min_df = self.min_df
    max_features = self.max_features

    vocabulary, X = self._count_vocab(raw_documents,
                                      self.fixed_vocabulary_)   # 重点关注

    if self.binary:
        X.data.fill(1)

    if not self.fixed_vocabulary_:
        X = self._sort_features(X, vocabulary)

        n_doc = X.shape[0]
        max_doc_count = (max_df
                         if isinstance(max_df, numbers.Integral)
                         else max_df * n_doc)
        min_doc_count = (min_df
                         if isinstance(min_df, numbers.Integral)
                         else min_df * n_doc)
        if max_doc_count < min_doc_count:
            raise ValueError(
                "max_df corresponds to < documents than min_df")
        X, self.stop_words_ = self._limit_features(X, vocabulary,
                                                   max_doc_count,
                                                   min_doc_count,
                                                   max_features)

        self.vocabulary_ = vocabulary

    return X

如果参数输入包括vocabulary，则生成的是字典对象，否则就会生成defaultdict对象。而如果是字典对象，对于feature_idx = vocabulary[feature]，如果输入是不存在的key，则就会引发异常。

def _count_vocab(self, raw_documents, fixed_vocab):
    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        # Add a new value when a new vocabulary item is seen
        vocabulary = defaultdict()
        vocabulary.default_factory = vocabulary.__len__

    analyze = self.build_analyzer()
    j_indices = []
    indptr = []

    values = _make_int_array()
    indptr.append(0)
    for doc in raw_documents:
        feature_counter = {}
        for feature in analyze(doc):
            try:
                feature_idx = vocabulary[feature]
                if feature_idx not in feature_counter:
                    feature_counter[feature_idx] = 1
                else:
                    feature_counter[feature_idx] += 1
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue

        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # disable defaultdict behaviour
        vocabulary = dict(vocabulary)
        if not vocabulary:
            raise ValueError("empty vocabulary; perhaps the documents only"
                             " contain stop words")

    if indptr[-1] > 2147483648:  # = 2**31 - 1
        if sp_version >= (0, 14):
            indices_dtype = np.int64
        else:
            raise ValueError(('sparse CSR array has {} non-zero '
                              'elements and requires 64 bit indexing, '
                              ' which is unsupported with scipy {}. '
                              'Please upgrade to scipy >=0.14')
                             .format(indptr[-1], '.'.join(sp_version)))

    else:
        indices_dtype = np.int32
    j_indices = np.asarray(j_indices, dtype=indices_dtype)
    indptr = np.asarray(indptr, dtype=indices_dtype)
    values = np.frombuffer(values, dtype=np.intc)

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sort_indices()
    return vocabulary, X

在不指定vocabulary的情况下，指定tokenizer，即指定分词的函数，此时stop_words等还是有效的，因为我们指定这个参数，仅仅是确定如何分词，不影响最终生成的词典还需要去停用词，去不满足max_features等的词，但是token_pattern无效。

def build_tokenizer(self):
    """Return a function that splits a string into a sequence of tokens"""
    if self.tokenizer is not None:
        return self.tokenizer
    token_pattern = re.compile(self.token_pattern)
    return lambda doc: token_pattern.findall(doc)

在这里插入图片描述

在不指定vocabulary和tokenizer的情况下，指定token_pattern，该参数和tokenizer相似，指定了要保留哪些词，需要用正则表达式。

build_analyzer: build_preprocessor-> return lambda doc: self._word_ngrams(tokenize(preprocess(self.decode(doc))), stop_words)

CountVectorizer参数学习

猜你喜欢