A Summary of Common Methods for Text Classification

1 TF-IDF + Logistic Regression

python
import codecs
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

1.1 Load the data

python
import codecs

labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())

1.2 Encode the labels as integers

python
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
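
LabelEncoder keeps the label mapping around, so you can inspect or invert it. A minimal sanity check (the actual class names depend on your dataset):

python
# list the distinct label names and map the first few encoded labels back
print(label_encoder.classes_)
print(label_encoder.inverse_transform(y[:3]))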

1.3 Extract TF-IDF text features

python
from sklearn.feature_extraction.text import TfidfVectorizer

tfv1 = TfidfVectorizer(min_df=4,
                       max_df=0.6)

# Fit TF-IDF on both the training and the test text (semi-supervised learning)
tfv1.fit(text)
features = tfv1.transform(text)

Check the vocabulary size:

python
len(tfv1.get_feature_names())

84412

1.4 Split the data

python
from sklearn.model_selection import train_test_split
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(features, y,
                                                              stratify=y,
                                                              random_state=42,
                                                              test_size=0.1, shuffle=True)

1.5 Classify with the model

python
# Fit a simple Logistic Regression on the extracted TF-IDF features
# (multiclass_logloss is the evaluation helper defined in section 2.5 below)

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1.0, solver='lbfgs', multi_class='multinomial')
clf.fit(x_train_tfv, y_train)
predictions = clf.predict_proba(x_valid_tfv)

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.564

/home/ubuntu/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
  "of iterations.", ConvergenceWarning)

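The ConvergenceWarning means lbfgs hit its iteration cap (100 by default) before converging. A minimal fix, as the warning suggests (a sketch; the max_iter value actually needed depends on the data):

python
# give lbfgs more iterations so it can converge
clf = LogisticRegression(C=1.0, solver='lbfgs', multi_class='multinomial',
                         max_iter=1000)
clf.fit(x_train_tfv, y_train)
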
2 Word-Count Features + Logistic Regression

python
import codecs
import numpy as np
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

2.1 Load the data

python
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())

2.2 Encode the labels as integers

python
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

2.3 Extract word-count features

python
ctv = CountVectorizer(min_df=3,
                      max_df=0.5,
                      ngram_range=(1,2))

# Fit the CountVectorizer on both the training and the test text (semi-supervised learning)
ctv.fit(text)
text_ctv = ctv.transform(text)

2.4 Split the dataset

python
from sklearn.model_selection import train_test_split
x_train_ctv, x_valid_ctv, y_train, y_valid = train_test_split(text_ctv, y,
                                                              stratify=y,
                                                              random_state=42,
                                                              test_size=0.1, shuffle=True)

2.5 Define the loss function

python
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multiclass version of the Logarithmic Loss metric.
    :param actual: array containing the actual target classes
    :param predicted: matrix of classification predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota
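
This helper is equivalent to scikit-learn's built-in log loss; a quick sanity check on a dummy 3-class example (a sketch):

python
from sklearn.metrics import log_loss

actual = np.array([0, 2, 1])
predicted = np.array([[0.7, 0.2, 0.1],
                      [0.1, 0.3, 0.6],
                      [0.2, 0.5, 0.3]])
# both implementations should agree to numerical precision
assert abs(multiclass_logloss(actual, predicted) - log_loss(actual, predicted)) < 1e-9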

2.6 Classify with the model

python
# Fit a simple Logistic Regression on the extracted word-count features

clf = LogisticRegression(C=1.0, solver='lbfgs', multi_class='multinomial')
clf.fit(x_train_ctv, y_train)
predictions = clf.predict_proba(x_valid_ctv)

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.784

/home/ubuntu/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
  "of iterations.", ConvergenceWarning)

3 TF-IDF + Naive Bayes

python
import codecs
import numpy as np
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

3.1 Load the data

python
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())

3.2 Encode the labels as integers

python
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

3.3 Extract TF-IDF features

python
tfv1 = TfidfVectorizer(min_df=4,
                       max_df=0.6)

# Fit TF-IDF on both the training and the test text (semi-supervised learning)
tfv1.fit(text)
features = tfv1.transform(text)

3.4 Split the dataset

python
from sklearn.model_selection import train_test_split
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(features, y,
                                                              stratify=y,
                                                              random_state=42,
                                                              test_size=0.1, shuffle=True)

3.5 Define the loss function

python
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multiclass version of the Logarithmic Loss metric.
    :param actual: array containing the actual target classes
    :param predicted: matrix of classification predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

3.6 Classify with the model

python
# Fit a Multinomial Naive Bayes model on the extracted TF-IDF features

clf = MultinomialNB()
clf.fit(x_train_tfv, y_train)
predictions = clf.predict_proba(x_valid_tfv)

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

4 Word-Count Features + Naive Bayes

python
import codecs
import numpy as np
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

4.1 Load the data

python
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())

4.2 Encode the labels as integers

python
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

4.3 Extract word-count features

python
ctv = CountVectorizer(min_df=3,
                      max_df=0.5,
                      ngram_range=(1,2))

# Fit the CountVectorizer on both the training and the test text (semi-supervised learning)
ctv.fit(text)
text_ctv = ctv.transform(text)

4.4 Split the dataset

python
from sklearn.model_selection import train_test_split
x_train_ctv, x_valid_ctv, y_train, y_valid = train_test_split(text_ctv, y,
                                                              stratify=y,
                                                              random_state=42,
                                                              test_size=0.1, shuffle=True)

4.5 Define the loss function

python
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multiclass version of the Logarithmic Loss metric.
    :param actual: array containing the actual target classes
    :param predicted: matrix of classification predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

4.6 Classify with the model

python
# Fit a Multinomial Naive Bayes model on the extracted word-count features

clf = MultinomialNB()
clf.fit(x_train_ctv, y_train)
predictions = clf.predict_proba(x_valid_ctv)

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 3.735

Note the much higher log loss here: naive Bayes tends to produce overconfident probability estimates, and log loss punishes confident mistakes heavily.

5 SVD Features of TF-IDF + Support Vector Machine

5.1 Prepare the data

python
import codecs
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing, decomposition
from sklearn.svm import SVC

# 1 Load the data
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())

# 2 Encode the labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)


# 3 Extract TF-IDF text features
tfv1 = TfidfVectorizer(min_df=4,
                       max_df=0.6)
tfv1.fit(text)
features = tfv1.transform(text)


# 4 Split the dataset
from sklearn.model_selection import train_test_split
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(features, y,
                                                              stratify=y,
                                                              random_state=42,
                                                              test_size=0.1, shuffle=True)

5.2 Define the loss function

python
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multiclass version of the Logarithmic Loss metric.
    :param actual: array containing the actual target classes
    :param predicted: matrix of classification predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

5.3 Standardize the data

Because SVMs take a long time to train on high-dimensional input, we first use Singular Value Decomposition (SVD) to reduce the number of TF-IDF features.

We also need to standardize the data before feeding it to the SVM.

python
# Reduce dimensionality with SVD, using 120 components.
# For SVM, a reasonable tuning range for the number of SVD components is roughly 120-200.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(x_train_tfv)
xtrain_svd = svd.transform(x_train_tfv)
xvalid_svd = svd.transform(x_valid_tfv)

# Scale the data obtained from the SVD
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

5.4 Classify with the model

python
# Fit the SVM
clf = SVC(C=1.0, probability=True)  # probability=True because we need predict_proba for the log loss
clf.fit(xtrain_svd_scl, y_train)
predictions = clf.predict_proba(xvalid_svd_scl)

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.390

6 Grid Search + Naive Bayes

Grid search is a hyperparameter-optimization technique: searching over candidate parameter combinations finds the one that yields the best text-classification results.

Below we run a grid search over a naive Bayes model.

python
import codecs
from sklearn import preprocessing, metrics, pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
import numpy as np

from sklearn.naive_bayes import MultinomialNB

6.1 Prepare the data

python
# 1 Load the data
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())

# 2 Encode the labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)


# 3 Extract TF-IDF text features
tfv1 = TfidfVectorizer(min_df=4,
                       max_df=0.6)
tfv1.fit(text)
features = tfv1.transform(text)


# 4 Split the dataset
from sklearn.model_selection import train_test_split
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(features, y,
                                                              stratify=y,
                                                              random_state=42,
                                                              test_size=0.1, shuffle=True)

6.2 Define the loss function

python
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multiclass version of the Logarithmic Loss metric.
    :param actual: array containing the actual target classes
    :param predicted: matrix of classification predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

6.3 Create a scoring function

Before starting the grid search, we need a scoring function, which scikit-learn's make_scorer builds for us. Note greater_is_better=False: GridSearchCV always maximizes the score, so the log loss is negated, which is why the best score below is reported as a negative number.

python
mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

6.4 Run the grid search

Using the naive Bayes algorithm as the example:

python
nb_model = MultinomialNB()

# Build the pipeline
clf = pipeline.Pipeline([('nb', nb_model)])

# Parameters to search over
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize the Grid Search model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                     verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit the grid-search model
model.fit(x_train_tfv, y_train)  # to cut computation, we search on the training split only
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:    1.9s remaining:    5.7s
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:    2.0s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    2.1s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:    2.2s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    2.2s finished

Best score: -0.560
Best parameters set:
    nb__alpha: 0.01

6.5 Classify with the model

python
# Fit Naive Bayes on the TF-IDF features with the best alpha found above
clf = MultinomialNB(alpha=0.01)
clf.fit(x_train_tfv, y_train)
predictions = clf.predict_proba(x_valid_tfv)

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.520
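
classification_report was imported earlier but never used; a small follow-up sketch (the exact numbers depend on your data) that summarizes per-class precision and recall on the validation split:

python
from sklearn.metrics import classification_report

# per-class precision/recall/F1, with the original label names restored
print(classification_report(y_valid, clf.predict(x_valid_tfv),
                            target_names=label_encoder.classes_))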

7 word2vec Embeddings + xgboost

python
import codecs
import gensim
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import numpy as np
import xgboost as xgb
from tqdm import tqdm

7.1 Prepare the data

python
# Read the data
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())

# Encode the labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split each sentence into individual words
text_s2w = [s.split() for s in text]

7.2 Build the word2vec model

python
model = gensim.models.Word2Vec(text_s2w,
                               min_count=5,
                               workers=6,
                               window=8,
                               size=100)

Parameter notes:

  • min_count: words with frequency below min_count are discarded. (Strictly, the better practice is to replace them with an UNK token, the so-called out-of-vocabulary word; for simplicity we treat such low-frequency words as unimportant and drop them.)

  • workers: number of cores used in parallel; this only takes effect when Cython is installed (simply pip install cython).

  • size: dimensionality of the word vectors, i.e. the number of hidden-layer units in the neural network.

  • window: maximum distance between the target word and its context words. For example, the CBOW model predicts a word from its context, and that context needs a bound: words too far from the target carry little meaning, while too narrow a window provides too little information, so window caps the context distance.

7.3 Basic usage of the word2vec model

7.3.1 Build a word-embedding dictionary

python
embeddings_index = dict(zip(model.wv.index2word, model.wv.vectors))
print('Found %s word vectors.' % len(embeddings_index))

Found 87117 word vectors.
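
The dictionary returns the same vectors without going through the model object, e.g. (a sketch, assuming the word is in the vocabulary):

python
vec = embeddings_index['汽车']
print(vec.shape)  # (100,)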

7.3.2 Get the vector for a word

python
model['汽车']  # deprecated in gensim 3.x; model.wv['汽车'] is the forward-compatible form
/home/ubuntu/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  """Entry point for launching an IPython kernel.

array([-2.240292  , -1.1615268 , -1.4746077 ,  2.1054246 ,  4.819405  ,
       -3.1492457 , -0.05073776, -2.1645617 , -1.2719896 ,  1.7608824 ,
       -0.2626409 , -0.64887804,  1.3482507 ,  0.34045577,  1.4765079 ,
       -3.445696  ,  1.449008  , -0.09463242,  0.6401563 , -1.6335047 ,
       -0.30473268,  2.6725786 , -0.1342183 ,  0.27526513, -2.4943345 ,
        0.27751288, -1.9030106 , -0.2115223 ,  0.48280153,  2.8040369 ,
        1.4369518 , -1.6659547 ,  0.6498365 ,  3.1322846 , -1.7274039 ,
       -0.4276681 ,  2.0273833 , -1.2563524 , -2.2891238 ,  0.80385494,
       -0.8380016 , -1.1951414 ,  0.21576834, -1.8307697 ,  1.4016038 ,
       -0.07672032,  0.97227174,  1.3520627 ,  0.568014  , -1.914469  ,
       -1.1551676 ,  0.7751831 ,  0.7154037 ,  1.2694645 ,  1.9431589 ,
       -0.06259096,  3.4280195 ,  0.6663932 , -2.665189  ,  0.6598596 ,
       -0.07868402, -0.5291124 ,  1.8237985 , -0.7853107 , -0.16555293,
       -2.074671  , -0.87207425,  0.7680195 ,  0.40575528,  0.29356548,
       -2.8064344 , -2.5557816 , -1.554487  , -2.7589092 , -0.35392886,
       -0.6011241 , -0.31734776, -1.1346784 ,  0.1052264 ,  0.57027906,
        1.1536218 ,  2.066991  , -1.1962171 ,  1.0027347 ,  0.40441233,
        2.2641828 , -2.0621223 ,  2.0815525 ,  3.5621598 , -0.4967822 ,
       -0.717848  ,  3.1545784 ,  1.1730249 ,  1.3114505 , -0.36371502,
       -0.41231316, -2.3199863 , -0.10876293, -0.44529822, -2.18213   ],
      dtype=float32)

7.3.3 Find the words most similar to a given word

python
model.most_similar('人民日报')  # deprecated; model.wv.most_similar(...) is the forward-compatible form
/home/ubuntu/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  """Entry point for launching an IPython kernel.

[('光明日报', 0.8604782223701477),
 ('海外版', 0.8062193393707275),
 ('年月日', 0.7948733568191528),
 ('经济日报', 0.7898619174957275),
 ('文汇报', 0.7830426692962646),
 ('社论', 0.7795723676681519),
 ('评论员', 0.765376091003418),
 ('中国作协', 0.7639801502227783),
 ('讲话', 0.7555620670318604),
 ('第五次', 0.7492089867591858)]
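
The pairwise similarity behind this ranking is also directly accessible; per the listing above, this pair should come out around 0.86:

python
# cosine similarity between two in-vocabulary words
print(model.wv.similarity('人民日报', '光明日报'))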

7.3.4 Save the model

python
model.save('/tmp/w2v_model')

7.3.5 Load the model

python
model_load = gensim.models.Word2Vec.load('/tmp/w2v_model')
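
A quick round-trip check (a sketch): the reloaded model should return identical vectors.

python
assert np.allclose(model_load.wv['汽车'], model.wv['汽车'])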

7.4 Process the training data

python
# This function turns a sentence into a single normalized vector
def sent2vec(s):
    """
    Convert each sentence into one 100-dimensional vector.
    """
    words = s.split()
    M = []
    for w in words:
        try:
            #M.append(embeddings_index[w])
            M.append(model[w])
        except:
            continue
    M = np.array(M)  # shape=(x, 100): x is the number of words in the sentence, 100 the word-vector dimension
    v = M.sum(axis=0)  # a 100-dimensional vector: the per-dimension sum over the x word vectors
    if type(v) != np.ndarray:
        return np.zeros(100)

    return v / np.sqrt((v ** 2).sum())  # L2-normalize, so every sentence ends up as a unit 100-dimensional vector
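
A quick shape check on a single document (a sketch):

python
v = sent2vec(text[0])
print(v.shape)                  # (100,)
print(np.sqrt((v ** 2).sum()))  # ~1.0 after normalization (0 for all-OOV sentences)
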
python
# Apply the function above to vectorize every document
text_s2v = [sent2vec(s) for s in tqdm(text)]

# Convert to a numpy array
text_s2v = np.array(text_s2v)

# Split the dataset
from sklearn.model_selection import train_test_split
x_train_w2v, x_valid_w2v, y_train, y_valid = train_test_split(text_s2v, y,
                                                              stratify=y,
                                                              random_state=42,
                                                              test_size=0.1, shuffle=True)
  0%|          | 0/9249 [00:00<?, ?it/s]/home/ubuntu/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:11: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 9249/9249 [01:11<00:00, 129.79it/s]

7.5 Classify with the model

python
# Define the loss function
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multiclass version of the Logarithmic Loss metric.
    :param actual: array containing the actual target classes
    :param predicted: matrix of classification predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota
python
# Fit a simple xgboost model on the word2vec features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(x_train_w2v, y_train)
predictions = clf.predict_proba(x_valid_w2v)

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.367

8 Word-Count Features + xgboost

python
import codecs
import numpy as np
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb

8.1 Load the data

python
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())

8.2 Encode the labels as integers

python
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

8.3 Extract word-count features

python
ctv = CountVectorizer(min_df=3,
                      max_df=0.5,
                      ngram_range=(1,2))

# Fit the CountVectorizer on both the training and the test text (semi-supervised learning)
ctv.fit(text)
text_ctv = ctv.transform(text)

8.4 Split the dataset

python
from sklearn.model_selection import train_test_split
x_train_ctv, x_valid_ctv, y_train, y_valid = train_test_split(text_ctv, y,
                                                              stratify=y,
                                                              random_state=42,
                                                              test_size=0.1, shuffle=True)

8.5 Define the loss function

python
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multiclass version of the Logarithmic Loss metric.
    :param actual: array containing the actual target classes
    :param predicted: matrix of classification predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

8.6 Classify with the model

python
# Run xgboost on the word-count features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(x_train_ctv, y_train)
predictions = clf.predict_proba(x_valid_ctv)

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.184

9 SVD Features of TF-IDF + xgboost

9.1 Prepare the data

python
import codecs
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing, decomposition
import xgboost as xgb

# 1 Load the data
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())

# 2 Encode the labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)


# 3 Extract TF-IDF text features
tfv1 = TfidfVectorizer(min_df=4,
                       max_df=0.6)
tfv1.fit(text)
features = tfv1.transform(text)


# 4 Split the dataset
from sklearn.model_selection import train_test_split
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(features, y,
                                                              stratify=y,
                                                              random_state=42,
                                                              test_size=0.1, shuffle=True)

9.2 Standardize the data

As in section 5, we first reduce the TF-IDF features with Singular Value Decomposition (SVD) and then standardize the result, so that xgboost can be compared on both the raw and the scaled SVD features below.

python
# Reduce dimensionality with SVD, using 120 components.
# For SVM, a reasonable tuning range for the number of SVD components is roughly 120-200.
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(x_train_tfv)
xtrain_svd = svd.transform(x_train_tfv)
xvalid_svd = svd.transform(x_valid_tfv)

# Scale the data obtained from the SVD
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

9.3 Define the loss function

python
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multiclass version of the Logarithmic Loss metric.
    :param actual: array containing the actual target classes
    :param predicted: matrix of classification predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

9.4 Classify with the model

9.4.1 SVD features of TF-IDF

Run xgboost on the SVD features of the TF-IDF matrix:

python
clf = xgb.XGBClassifier(nthread=10)
clf.fit(xtrain_svd, y_train)
predictions = clf.predict_proba(xvalid_svd)

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.385

9.4.2 Standardized TF-IDF SVD features

Run xgboost on the standardized (scaled) TF-IDF SVD features. The log loss comes out unchanged: tree-based models split on feature thresholds, so they are largely insensitive to rescaling of the inputs.

python
clf = xgb.XGBClassifier(nthread=10)
clf.fit(xtrain_svd_scl, y_train)
predictions = clf.predict_proba(xvalid_svd_scl)

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.385

10 TF-IDF + xgboost

python
import codecs
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

10.1 Prepare the data

python
import xgboost as xgb

# 1 Load the data
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())

# 2 Encode the labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)


# 3 Extract TF-IDF text features
tfv1 = TfidfVectorizer(min_df=4,
                       max_df=0.6)
tfv1.fit(text)
features = tfv1.transform(text)


# 4 Split the dataset
from sklearn.model_selection import train_test_split
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(features, y,
                                                              stratify=y,
                                                              random_state=42,
                                                              test_size=0.1, shuffle=True)

10.2 Define the loss function

python
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multiclass version of the Logarithmic Loss metric.
    :param actual: array containing the actual target classes
    :param predicted: matrix of classification predictions, one probability per class
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

10.3 Classify with the model

python
# Run xgboost on the TF-IDF features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                        subsample=0.8, nthread=10, learning_rate=0.1)
# .tocsc() converts the sparse TF-IDF matrix to CSC format before handing it to xgboost
clf.fit(x_train_tfv.tocsc(), y_train)
predictions = clf.predict_proba(x_valid_tfv.tocsc())

print("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

logloss: 0.225
Author: foochane
Link: https://foochane.cn/article/2019091501.html
License: Unless otherwise noted, all posts on this blog are licensed under CC BY-NC-SA 4.0. Please credit foochane when reposting.