corpus = [
'This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',
]
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = CountVectorizer()
count = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())
print(vectorizer.vocabulary_)
print(count.toarray())
From the output you can see that a vocabulary is built first and used as the feature set; each sentence is then represented by how many times each vocabulary word occurs in it.
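For this corpus, the three print calls should produce output along these lines (the ordering of the vocabulary_ dict may differ from run to run):

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
{'this': 8, 'is': 3, 'the': 6, 'first': 2, 'document': 1, 'second': 5, 'and': 0, 'third': 7, 'one': 4}
[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 0 1 0]
 [0 1 1 1 0 0 1 0 1]]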
Now let's look at how TfidfVectorizer is used:
bb = TfidfVectorizer(use_idf=False)
bb = bb.fit_transform(corpus)
print(bb.toarray())
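A quick sanity check (my own sketch, not from the original snippet): TfidfVectorizer should give exactly the same result as running CountVectorizer and then TfidfTransformer by hand, which is what the source code below does internally.

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import numpy as np

counts = CountVectorizer().fit_transform(corpus)             # raw term counts
two_step = TfidfTransformer().fit_transform(counts)          # idf weighting + l2 norm
one_step = TfidfVectorizer().fit_transform(corpus)           # same thing in one call
print(np.allclose(two_step.toarray(), one_step.toarray()))   # expect True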
Now look at the TfidfVectorizer source code. Stripped down to the main parts it looks like this; note that it inherits from CountVectorizer:
class TfidfVectorizer(CountVectorizer):
    def __init__(self, input='content', encoding='utf-8',
                 decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word',
                 stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1), max_df=1.0, min_df=1,
                 max_features=None, vocabulary=None, binary=False,
                 dtype=np.float64, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):
        super().__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype)
        self._tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
                                       smooth_idf=smooth_idf,
                                       sublinear_tf=sublinear_tf)

    def fit_transform(self, raw_documents, y=None):
        self._check_params()
        X = super().fit_transform(raw_documents)
        self._tfidf.fit(X)
        return self._tfidf.transform(X, copy=False)
It first runs CountVectorizer's fit_transform, so X is the term-count matrix (a sparse matrix) we saw earlier, and then it applies TfidfTransformer's fit and transform:
def fit(self, X, y=None):
    X = check_array(X, accept_sparse=('csr', 'csc'))
    if not sp.issparse(X):
        X = sp.csr_matrix(X)
    dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64
    if self.use_idf:
        n_samples, n_features = X.shape
        df = _document_frequency(X)
        # print(df) -> [1 3 2 3 1 1 4 1 3]: how many documents each word occurs in;
        # with 4 sentences the maximum possible value is 4
        df = df.astype(dtype, **_astype_copy_false(df))
        # add 1 to both numerator and denominator for a smoother idf
        df += int(self.smooth_idf)          # smooth_idf is a bool: +1 if True, unchanged if False
        n_samples += int(self.smooth_idf)
        # the trailing +1 keeps the idf of a word that appears in every document from being 0
        idf = np.log(n_samples / df) + 1
        self._idf_diag = sp.diags(idf, offsets=0,
                                  shape=(n_features, n_features),
                                  format='csr',
                                  dtype=dtype)
    return self
def transform(self, X, copy=True):
    X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy)
    if not sp.issparse(X):
        X = sp.csr_matrix(X, dtype=np.float64)
    n_samples, n_features = X.shape
    if self.sublinear_tf:
        np.log(X.data, X.data)
        X.data += 1
    if self.use_idf:
        check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')
        expected_n_features = self._idf_diag.shape[0]
        if n_features != expected_n_features:
            raise ValueError("Input has n_features=%d while the model"
                             " has been trained with n_features=%d" % (
                                 n_features, expected_n_features))
        # *= doesn't work
        X = X * self._idf_diag
    if self.norm:
        X = normalize(X, norm=self.norm, copy=False)
    return X
In fit, an idf vector is computed and stored in self._idf_diag as a sparse diagonal matrix; in transform, the count matrix is multiplied by that diagonal matrix and the rows are then L2-normalized. The tf part on its own is just the CountVectorizer output with L2 normalization applied.
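To make the fit step concrete, here is a small sketch (my addition; the df values are taken from the comment in fit above) that recomputes the smoothed idf by hand and compares it with the fitted idf_ attribute:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

df = np.array([1, 3, 2, 3, 1, 1, 4, 1, 3], dtype=np.float64)  # document frequencies printed above
n_samples = 4
idf = np.log((n_samples + 1) / (df + 1)) + 1                  # smooth_idf=True: +1 top and bottom, then +1

tv = TfidfVectorizer()            # defaults: use_idf=True, smooth_idf=True
tv.fit(corpus)
print(np.allclose(idf, tv.idf_))  # expect True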
Let's test the case without idf, i.e. use_idf=False:
from sklearn.preprocessing import normalize
vectorizer = CountVectorizer()
count = vectorizer.fit_transform(corpus)
print(normalize(count, 'l2').toarray())

# compare with
bb = TfidfVectorizer(use_idf=False)
bb = bb.fit_transform(corpus)
print(bb.toarray())
# the two printed arrays are identical
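The same kind of check works for the default use_idf=True path. The sketch below (my addition) rebuilds the tf-idf matrix from the raw counts and the fitted idf vector, mirroring what transform does: scale each column by its idf, then L2-normalize each row.

import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

counts = CountVectorizer().fit_transform(corpus)           # raw term counts
tv = TfidfVectorizer()                                      # defaults: use_idf=True, smooth_idf=True, norm='l2'
tfidf = tv.fit_transform(corpus)

manual = normalize(counts * sp.diags(tv.idf_), norm='l2')   # counts scaled by idf, rows l2-normalized
print(np.allclose(manual.toarray(), tfidf.toarray()))       # expect True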