corpus = [
'This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',
]
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = CountVectorizer()
count = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(vectorizer.vocabulary_)
print(count.toarray())
From the output you can see that a vocabulary is built first and used as the feature set; each sentence is then represented by how many times each vocabulary word occurs in it.
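For this corpus, the three print calls should produce output along these lines (the ordering of the vocabulary_ dict may differ from run to run):

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 0 1 0]
 [0 1 1 1 0 0 1 0 1]]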
Now let's look at how TfidfVectorizer is used:
bb = TfidfVectorizer(use_idf=False)
bb = bb.fit_transform(corpus)
print(bb.toarray())
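A quick sanity check (my own sketch, not from the original snippet): TfidfVectorizer should give exactly the same result as running CountVectorizer and then TfidfTransformer by hand, which is what the source code below does internally.

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import numpy as np

counts = CountVectorizer().fit_transform(corpus)             # raw term counts
two_step = TfidfTransformer().fit_transform(counts)          # idf weighting + l2 norm
one_step = TfidfVectorizer().fit_transform(corpus)           # same thing in one call
print(np.allclose(two_step.toarray(), one_step.toarray()))   # expect True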
Now look at the TfidfVectorizer source code. Stripped down to the main parts it looks like this; note that it inherits from CountVectorizer:
class TfidfVectorizer(CountVectorizer):
    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.float64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):
        super().__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype)
        self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
                                       smooth_idf=smooth_idf,
                                       sublinear_tf=sublinear_tf)

    def fit_transform(self, raw_documents, y=None):
        self._check_params()
        X = super().fit_transform(raw_documents)
        self._tfidf.fit(X)
        return self._tfidf.transform(X, copy=False)
It first runs CountVectorizer's fit_transform, so X is the term-count matrix (a sparse matrix) we saw earlier, and then it applies TfidfTransformer's fit and transform:
def fit(self, X, y=None):
    X = check_array(X, accept_sparse=('csr', 'csc'))
    if not sp.issparse(X):
        X = sp.csr_matrix(X)
    dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)
        # print(df) -> [1 3 2 3 1 1 4 1 3]: how many documents each word occurs in;
        # with 4 sentences the maximum possible value is 4
        df = df.astype(dtype, **_astype_copy_false(df))
        # add 1 to both numerator and denominator for a smoother idf
        df += int(self.smooth_idf)          # smooth_idf is a bool: +1 if True, unchanged if False
        n_samples += int(self.smooth_idf)
        # the trailing +1 keeps the idf of a word that appears in every document from being 0
        idf = np.log(n_samples / df) + 1
        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=dtype)
    return self
def transform(self, X, copy=True):
    X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy)
    if not sp.issparse(X):
        X = sp.csr_matrix(X, dtype=np.float64)
    n_samples, n_features = X.shape
    if self.sublinear_tf:
        np.log(X.data, X.data)
        X.data += 1
    if self.use_idf:
        check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')
        expected_n_features = self._idf_diag.shape[0]
        if n_features != expected_n_features:
            raise ValueError("Input has n_features=%d while the model"
                             " has been trained with n_features=%d" % (
                                 n_features, expected_n_features))
        # *= doesn't work
        X = X * self._idf_diag
    if self.norm:
        X = normalize(X, norm=self.norm, copy=False)
    return X
In fit, an idf vector is computed and stored in self._idf_diag as a sparse diagonal matrix; in transform, the count matrix is multiplied by that diagonal matrix and the rows are then L2-normalized. The tf part on its own is just the CountVectorizer output with L2 normalization applied.
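To make the fit step concrete, here is a small sketch (my addition; the df values are taken from the comment in fit above) that recomputes the smoothed idf by hand and compares it with the fitted idf_ attribute:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

df = np.array([1, 3, 2, 3, 1, 1, 4, 1, 3], dtype=np.float64)  # document frequencies printed above
n_samples = 4
idf = np.log((n_samples + 1) / (df + 1)) + 1                  # smooth_idf=True: +1 top and bottom, then +1

tv = TfidfVectorizer()            # defaults: use_idf=True, smooth_idf=True
tv.fit(corpus)
print(np.allclose(idf, tv.idf_))  # expect True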
Let's test the case without idf, i.e. use_idf=False:
from sklearn.preprocessing import normalize
vectorizer = CountVectorizer()
count = vectorizer.fit_transform(corpus)
print(normalize(count, 'l2').toarray())

# compare with
bb = TfidfVectorizer(use_idf=False)
bb = bb.fit_transform(corpus)
print(bb.toarray())
# the two printed arrays are identical
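The same kind of check works for the default use_idf=True path. The sketch below (my addition) rebuilds the tf-idf matrix from the raw counts and the fitted idf vector, mirroring what transform does: scale each column by its idf, then L2-normalize each row.

import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

counts = CountVectorizer().fit_transform(corpus)           # raw term counts
tv = TfidfVectorizer()                                      # defaults: use_idf=True, smooth_idf=True, norm='l2'
tfidf = tv.fit_transform(corpus)

manual = normalize(counts * sp.diags(tv.idf_), norm='l2')   # counts scaled by idf, rows l2-normalized
print(np.allclose(manual.toarray(), tfidf.toarray()))       # expect True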