Text Classification: A Summary of Common Methods

1 TF-IDF + Logistic Regression

import codecs
import numpy as np
from sklearn.preprocessing import LabelEncoder

1.1 Loading the Data


labels = []
text = []
with codecs.open('output/data_clean_split.txt','r',encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())  

1.2 Encoding the Labels as Integers

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
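
LabelEncoder keeps the learned mapping in classes_, and inverse_transform maps encoded ids back to the original label strings (illustrative; the actual labels depend on the dataset):

# Inspect the learned label mapping
print(label_encoder.classes_)
# Map encoded ids back to the original labels
print(label_encoder.inverse_transform([0, 1]))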

1.3 Extracting TF-IDF Text Features

from sklearn.feature_extraction.text import TfidfVectorizer

tfv1 = TfidfVectorizer(min_df=4,  
                       max_df=0.6)

# Fit TF-IDF on all of the text, train and validation together (full-vocabulary / "semi-supervised" trick)
tfv1.fit(text)
features = tfv1.transform(text)

Check the vocabulary size:


len(tfv1.get_feature_names())
84412
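
To see what ended up in the vocabulary, you can peek at a few feature names (illustrative; the exact tokens depend on the corpus):

# Show the first 10 of the 84412 vocabulary terms
print(tfv1.get_feature_names()[:10])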

1.4 Splitting the Data

from sklearn.model_selection import train_test_split
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(features, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

1.5 Classifying with the Model

# Fit a simple Logistic Regression on the extracted TF-IDF features
# (multiclass_logloss, used below, is defined in section 2.5)

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1.0,solver='lbfgs',multi_class='multinomial')
clf.fit(x_train_tfv, y_train)
predictions = clf.predict_proba(x_valid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))
logloss: 0.564 


/home/ubuntu/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
  "of iterations.", ConvergenceWarning)

2 Word-Count Features + Logistic Regression

import codecs 
import numpy as np
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

2.1 Loading the Data


labels = []
text = []
with codecs.open('output/data_clean_split.txt','r',encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())  

2.2 Encoding the Labels as Integers

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

2.3 Extracting Word-Count Features

ctv = CountVectorizer(min_df=3,
                      max_df=0.5,
                      ngram_range=(1,2))

# Fit the CountVectorizer on all of the text, train and validation together (full-vocabulary / "semi-supervised" trick)
ctv.fit(text)
text_ctv = ctv.transform(text)

2.4 Splitting the Data

from sklearn.model_selection import train_test_split
x_train_ctv, x_valid_ctv, y_train, y_valid = train_test_split(text_ctv, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

2.5 Defining the Loss Function

def multiclass_logloss(actual, predicted, eps=1e-15):
    """对数损失度量(Logarithmic Loss  Metric)的多分类版本。
    :param actual: 包含actual target classes的数组
    :param predicted: 分类预测结果矩阵, 每个类别都有一个概率
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota
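
As a quick check that this matches the standard definition, sklearn.metrics.log_loss computes the same quantity (a minimal sketch):

from sklearn.metrics import log_loss

actual = np.array([0, 1, 2])
predicted = np.array([[0.8, 0.1, 0.1],
                      [0.2, 0.6, 0.2],
                      [0.1, 0.2, 0.7]])
print(multiclass_logloss(actual, predicted))  # ~0.364
print(log_loss(actual, predicted))            # same value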

2.6 Classifying with the Model

# Fit a simple Logistic Regression on the extracted word-count features

clf = LogisticRegression(C=1.0,solver='lbfgs',multi_class='multinomial')
clf.fit(x_train_ctv, y_train)
predictions = clf.predict_proba(x_valid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))
logloss: 0.784 


/home/ubuntu/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
  "of iterations.", ConvergenceWarning)

3 TF-IDF + Naive Bayes

import codecs 
import numpy as np
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

3.1 Loading the Data


labels = []
text = []
with codecs.open('output/data_clean_split.txt','r',encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())  

3.2 Encoding the Labels as Integers

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

3.3 Extracting TF-IDF Features

tfv1 = TfidfVectorizer(min_df=4,
                       max_df=0.6)

# Fit TF-IDF on all of the text, train and validation together (full-vocabulary / "semi-supervised" trick)
tfv1.fit(text)
features = tfv1.transform(text)

3.4 Splitting the Data

from sklearn.model_selection import train_test_split
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(features, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

3.5 Defining the Loss Function

def multiclass_logloss(actual, predicted, eps=1e-15):
    """对数损失度量(Logarithmic Loss  Metric)的多分类版本。
    :param actual: 包含actual target classes的数组
    :param predicted: 分类预测结果矩阵, 每个类别都有一个概率
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

3.6 Classifying with the Model

# Fit a Multinomial Naive Bayes on the TF-IDF features

clf = MultinomialNB()
clf.fit(x_train_tfv, y_train)
predictions = clf.predict_proba(x_valid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))

Section 6.5 below tunes alpha for this same model with a grid search and reaches a log loss of 0.520.

4 Word-Count Features + Naive Bayes

import codecs 
import numpy as np
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

4.1 Loading the Data


labels = []
text = []
with codecs.open('output/data_clean_split.txt','r',encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())  

4.2 Encoding the Labels as Integers

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

4.3 Extracting Word-Count Features

ctv = CountVectorizer(min_df=3,
                      max_df=0.5,
                      ngram_range=(1,2))

# Fit the CountVectorizer on all of the text, train and validation together (full-vocabulary / "semi-supervised" trick)
ctv.fit(text)
text_ctv = ctv.transform(text)

4.4 Splitting the Data

from sklearn.model_selection import train_test_split
x_train_ctv, x_valid_ctv, y_train, y_valid = train_test_split(text_ctv, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

4.5 Defining the Loss Function

def multiclass_logloss(actual, predicted, eps=1e-15):
    """对数损失度量(Logarithmic Loss  Metric)的多分类版本。
    :param actual: 包含actual target classes的数组
    :param predicted: 分类预测结果矩阵, 每个类别都有一个概率
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

4.6 Classifying with the Model

# Fit a Multinomial Naive Bayes on the extracted word-count features

clf = MultinomialNB()
clf.fit(x_train_ctv, y_train)
predictions = clf.predict_proba(x_valid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))
logloss: 3.735 

Note how much worse this is than logistic regression's 0.784 on the same word-count features: Naive Bayes tends to produce overconfident probability estimates, and log loss punishes confident mistakes heavily.
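A standard remedy (not used in this post) is to calibrate the probabilities, for example with scikit-learn's CalibratedClassifierCV:

from sklearn.calibration import CalibratedClassifierCV

# Wrap Naive Bayes in sigmoid calibration, 3-fold CV on the training split (illustrative)
calibrated = CalibratedClassifierCV(MultinomialNB(), method='sigmoid', cv=3)
calibrated.fit(x_train_ctv, y_train)
predictions = calibrated.predict_proba(x_valid_ctv)
print("logloss: %0.3f" % multiclass_logloss(y_valid, predictions))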

5 TF-IDF SVD Features + Support Vector Machine

5.1 Preparing the Data

import codecs 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing, decomposition
from sklearn.svm import SVC

# 1 Load the data
labels = []
text = []
with codecs.open('output/data_clean_split.txt','r',encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())  

# 2 Encode the labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)


# 3 Extract TF-IDF text features
tfv1 = TfidfVectorizer(min_df=4,  
                       max_df=0.6)
tfv1.fit(text)
features = tfv1.transform(text)


# 4 Split the data
from sklearn.model_selection import train_test_split
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(features, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

5.2 Defining the Loss Function

def multiclass_logloss(actual, predicted, eps=1e-15):
    """对数损失度量(Logarithmic Loss  Metric)的多分类版本。
    :param actual: 包含actual target classes的数组
    :param predicted: 分类预测结果矩阵, 每个类别都有一个概率
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

5.3 SVD Dimensionality Reduction and Standardization

Because SVM training takes a lot of time, we first use Singular Value Decomposition (SVD) to reduce the number of TF-IDF features before applying it.

We also need to standardize the data before it goes into the SVM.

#Reduce dimensionality with SVD; n_components is set to 120. For SVM, a reasonable tuning range for the number of SVD components is roughly 120 to 200
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(x_train_tfv)
xtrain_svd = svd.transform(x_train_tfv)
xvalid_svd = svd.transform(x_valid_tfv)

#Scale the features obtained from the SVD
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)
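
How much of the TF-IDF signal the 120 components retain can be checked through the explained variance ratio (illustrative; the value depends on the data):

# Total fraction of variance kept by the 120 SVD components
print("explained variance: %.3f" % svd.explained_variance_ratio_.sum())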

5.4 Classifying with the Model

# Fit the SVM model
clf = SVC(C=1.0, probability=True) # since we need probabilities
clf.fit(xtrain_svd_scl, y_train)
predictions = clf.predict_proba(xvalid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))
logloss: 0.390 

6 Grid Search + Naive Bayes

Grid search is a technique for hyperparameter optimization. Knowing this trick lets you find a good parameter combination and noticeably improve text-classification results.

Below we run a grid search over a Naive Bayes model.

import codecs
from sklearn import metrics, pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
import numpy as np

from sklearn.naive_bayes import MultinomialNB

6.1 Preparing the Data

# 1 Load the data
labels = []
text = []
with codecs.open('output/data_clean_split.txt','r',encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())  

# 2 Encode the labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)


# 3 Extract TF-IDF text features
tfv1 = TfidfVectorizer(min_df=4,  
                       max_df=0.6)
tfv1.fit(text)
features = tfv1.transform(text)


# 4 Split the data
from sklearn.model_selection import train_test_split
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(features, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

6.2 Defining the Loss Function

def multiclass_logloss(actual, predicted, eps=1e-15):
    """对数损失度量(Logarithmic Loss  Metric)的多分类版本。
    :param actual: 包含actual target classes的数组
    :param predicted: 分类预测结果矩阵, 每个类别都有一个概率
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

6.3 Creating a Scoring Function

Before starting the grid search, we need to create a scoring function, which can be done with scikit-learn's make_scorer.


mll_scorer = metrics.make_scorer(multiclass_logloss, greater_is_better=False, needs_proba=True)

6.4 Running the Grid Search

We take the Naive Bayes algorithm as the example.

nb_model = MultinomialNB()

# Build the pipeline
clf = pipeline.Pipeline([('nb', nb_model)])

# Parameter grid to search
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize the grid-search model
model = GridSearchCV(estimator=clf, param_grid=param_grid, scoring=mll_scorer,
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit the grid-search model
model.fit(x_train_tfv, y_train)  # to keep the computation small, we search on the training split only
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:    1.9s remaining:    5.7s
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed:    2.0s remaining:    2.8s
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:    2.1s remaining:    1.5s


Best score: -0.560
Best parameters set:
    nb__alpha: 0.01


[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:    2.2s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    2.2s finished
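
Because refit=True, the GridSearchCV object is refit on the training split with the best parameters and can be used for prediction directly; the following is equivalent to the manual refit in section 6.5 (a sketch):

# model.best_estimator_ is already refit with the best alpha found above
predictions = model.predict_proba(x_valid_tfv)
print("logloss: %0.3f" % multiclass_logloss(y_valid, predictions))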

6.5 Classifying with the Model

# Fit Naive Bayes on the TF-IDF features with the alpha found by the grid search
clf = MultinomialNB(alpha=0.01)
clf.fit(x_train_tfv, y_train)
predictions = clf.predict_proba(x_valid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))
logloss: 0.520 

7 word2vec Embeddings + xgboost

import codecs
import gensim
from sklearn.preprocessing import LabelEncoder
import numpy as np
import xgboost as xgb
from tqdm import tqdm

7.1 Preparing the Data

# Load the data
labels = []
text = []
with codecs.open('output/data_clean_split.txt','r',encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())  

# Encode the labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split each sentence into individual tokens
text_s2w = [s.split() for s in text]

7.2 Building the word2vec Model

model = gensim.models.Word2Vec(text_s2w,
                               min_count=5,
                               workers=6,
                               window =8,
                               size=100)

Parameter notes:

  • min_count: words with frequency below min_count are discarded. (Strictly, the better practice is to replace them with an UNK token for "out-of-vocabulary" words; for simplicity we treat such low-frequency words as unimportant and drop them.)

  • workers: number of CPU cores to use in parallel; this only takes effect if Cython is installed (installing it is as simple as pip install cython).

  • size: dimensionality of the word vectors, i.e. the number of hidden-layer units in the network.

  • window: the maximum distance between the target word and its context words. CBOW, for example, predicts a word from its context, and that context needs a bound: take it too wide and words far from the target contribute little, too narrow and there is not enough information. window caps that context distance.

7.3 Basic Usage of the word2vec Model

7.3.1 Building a Word-Embedding Dictionary


embeddings_index = dict(zip(model.wv.index2word, model.wv.vectors))
print('Found %s word vectors.' % len(embeddings_index))
Found 87117 word vectors.

7.3.2 Getting a Word's Vector


model['汽车']
/home/ubuntu/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  """Entry point for launching an IPython kernel.

array([-2.240292  , -1.1615268 , -1.4746077 ,  2.1054246 ,  4.819405  ,
       -3.1492457 , -0.05073776, -2.1645617 , -1.2719896 ,  1.7608824 ,
       -0.2626409 , -0.64887804,  1.3482507 ,  0.34045577,  1.4765079 ,
       -3.445696  ,  1.449008  , -0.09463242,  0.6401563 , -1.6335047 ,
       -0.30473268,  2.6725786 , -0.1342183 ,  0.27526513, -2.4943345 ,
        0.27751288, -1.9030106 , -0.2115223 ,  0.48280153,  2.8040369 ,
        1.4369518 , -1.6659547 ,  0.6498365 ,  3.1322846 , -1.7274039 ,
       -0.4276681 ,  2.0273833 , -1.2563524 , -2.2891238 ,  0.80385494,
       -0.8380016 , -1.1951414 ,  0.21576834, -1.8307697 ,  1.4016038 ,
       -0.07672032,  0.97227174,  1.3520627 ,  0.568014  , -1.914469  ,
       -1.1551676 ,  0.7751831 ,  0.7154037 ,  1.2694645 ,  1.9431589 ,
       -0.06259096,  3.4280195 ,  0.6663932 , -2.665189  ,  0.6598596 ,
       -0.07868402, -0.5291124 ,  1.8237985 , -0.7853107 , -0.16555293,
       -2.074671  , -0.87207425,  0.7680195 ,  0.40575528,  0.29356548,
       -2.8064344 , -2.5557816 , -1.554487  , -2.7589092 , -0.35392886,
       -0.6011241 , -0.31734776, -1.1346784 ,  0.1052264 ,  0.57027906,
        1.1536218 ,  2.066991  , -1.1962171 ,  1.0027347 ,  0.40441233,
        2.2641828 , -2.0621223 ,  2.0815525 ,  3.5621598 , -0.4967822 ,
       -0.717848  ,  3.1545784 ,  1.1730249 ,  1.3114505 , -0.36371502,
       -0.41231316, -2.3199863 , -0.10876293, -0.44529822, -2.18213   ],
      dtype=float32)

7.3.3 Finding the Words Most Similar to a Given Word


model.most_similar('人民日报')
/home/ubuntu/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  """Entry point for launching an IPython kernel.

[('光明日报', 0.8604782223701477),
 ('海外版', 0.8062193393707275),
 ('年月日', 0.7948733568191528),
 ('经济日报', 0.7898619174957275),
 ('文汇报', 0.7830426692962646),
 ('社论', 0.7795723676681519),
 ('评论员', 0.765376091003418),
 ('中国作协', 0.7639801502227783),
 ('讲话', 0.7555620670318604),
 ('第五次', 0.7492089867591858)]

7.3.4 Saving the Model


model.save('/tmp/w2v_model')

7.3.5 Loading the Model


model_load = gensim.models.Word2Vec.load('/tmp/w2v_model')
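
The deprecation warnings above recommend accessing vectors through the wv attribute; the non-deprecated equivalents look like this (a short sketch of the gensim 3.x API):

# Same results as model['汽车'] and model.most_similar('人民日报'), without the warnings
vec = model_load.wv['汽车']
sims = model_load.wv.most_similar('人民日报')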

7.4 Preparing the Training Data

# This function turns a sentence into one normalized vector
def sent2vec(s):
    """
    Convert a sentence into a single 100-dimensional vector.
    """
    words = s.split()
    M = []
    for w in words:
        try:
            #M.append(embeddings_index[w])
            M.append(model[w])
        except KeyError:
            # skip words that are not in the word2vec vocabulary
            continue
    M = np.array(M)  # shape=(x,100): x is the number of words in the sentence, 100 the vector dimensionality
    v = M.sum(axis=0) # shape (100,): per-dimension sum over the x word vectors
    if type(v) != np.ndarray: 
        return np.zeros(100)

    return v / np.sqrt((v ** 2).sum()) # normalize to unit length: every sentence becomes one 100-dim vector
# Vectorize the whole corpus with the function above
text_s2v = [sent2vec(s) for s in tqdm(text)]

# Convert to a numpy array
text_s2v = np.array(text_s2v)

# Split the data
from sklearn.model_selection import train_test_split
x_train_w2v, x_valid_w2v, y_train, y_valid = train_test_split(text_s2v, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)
  0%|          | 0/9249 [00:00<?, ?it/s]/home/ubuntu/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:11: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 9249/9249 [01:11<00:00, 129.79it/s]
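
Applied to a single document, the function returns one unit-length vector (illustrative):

# One whitespace-tokenized document in, one 100-dimensional vector out
v = sent2vec(text[0])
print(v.shape)  # (100,)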

7.5 Classifying with the Model

# Define the loss function
def multiclass_logloss(actual, predicted, eps=1e-15):
    """对数损失度量(Logarithmic Loss  Metric)的多分类版本。
    :param actual: 包含actual target classes的数组
    :param predicted: 分类预测结果矩阵, 每个类别都有一个概率
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota
# Fit a simple Xgboost model on the word2vec features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(x_train_w2v, y_train)
predictions = clf.predict_proba(x_valid_w2v)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))
logloss: 0.367 

8 Word-Count Features + xgboost

import codecs 
import numpy as np
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb

8.1 Loading the Data


labels = []
text = []
with codecs.open('output/data_clean_split.txt','r',encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())  

8.2 Encoding the Labels as Integers

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

8.3 Extracting Word-Count Features

ctv = CountVectorizer(min_df=3,
                      max_df=0.5,
                      ngram_range=(1,2))

# Fit the CountVectorizer on all of the text, train and validation together (full-vocabulary / "semi-supervised" trick)
ctv.fit(text)
text_ctv = ctv.transform(text)

8.4 Splitting the Data

from sklearn.model_selection import train_test_split
x_train_ctv, x_valid_ctv, y_train, y_valid = train_test_split(text_ctv, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

8.5 Defining the Loss Function

def multiclass_logloss(actual, predicted, eps=1e-15):
    """对数损失度量(Logarithmic Loss  Metric)的多分类版本。
    :param actual: 包含actual target classes的数组
    :param predicted: 分类预测结果矩阵, 每个类别都有一个概率
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

8.6 Classifying with the Model

# Use xgboost on the word-count features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
clf.fit(x_train_ctv, y_train)
predictions = clf.predict_proba(x_valid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))
logloss: 0.184 
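
Gradient-boosted trees also expose per-feature importances; mapping the top columns back through the vectorizer's vocabulary shows which n-grams drive the classification (a sketch, not part of the original run):

# Print the ten most important n-grams (illustrative)
importances = clf.feature_importances_
top = np.argsort(importances)[::-1][:10]
feature_names = ctv.get_feature_names()
for i in top:
    print(feature_names[i], importances[i])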

9 TF-IDF SVD Features + xgboost

9.1 Preparing the Data



import codecs 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing, decomposition
import xgboost as xgb

# 1 Load the data
labels = []
text = []
with codecs.open('output/data_clean_split.txt','r',encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())  

# 2 Encode the labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)


# 3 Extract TF-IDF text features
tfv1 = TfidfVectorizer(min_df=4,  
                       max_df=0.6)
tfv1.fit(text)
features = tfv1.transform(text)


# 4 Split the data
from sklearn.model_selection import train_test_split
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(features, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

9.2 SVD Dimensionality Reduction and Standardization

As in section 5, we use Singular Value Decomposition (SVD) to reduce the number of TF-IDF features.

We also standardize the resulting features, so that the scaled and unscaled versions can be compared below.

#Reduce dimensionality with SVD; n_components is set to 120, as in section 5
svd = decomposition.TruncatedSVD(n_components=120)
svd.fit(x_train_tfv)
xtrain_svd = svd.transform(x_train_tfv)
xvalid_svd = svd.transform(x_valid_tfv)

#Scale the features obtained from the SVD
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

9.3 Defining the Loss Function

def multiclass_logloss(actual, predicted, eps=1e-15):
    """对数损失度量(Logarithmic Loss  Metric)的多分类版本。
    :param actual: 包含actual target classes的数组
    :param predicted: 分类预测结果矩阵, 每个类别都有一个概率
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

9.4 Classifying with the Model

9.4.1 Raw TF-IDF SVD Features

Use xgboost on the raw (unscaled) TF-IDF SVD features:

clf = xgb.XGBClassifier(nthread=10)
clf.fit(xtrain_svd, y_train)
predictions = clf.predict_proba(xvalid_svd)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))
logloss: 0.385 

9.4.2 Standardized TF-IDF SVD Features

Use xgboost on the standardized (scaled) TF-IDF SVD features. Tree models split on feature thresholds and are invariant to monotonic transformations such as standardization, which is why the score below matches the unscaled run.

clf = xgb.XGBClassifier(nthread=10)
clf.fit(xtrain_svd_scl, y_train)
predictions = clf.predict_proba(xvalid_svd_scl)

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))
logloss: 0.385 

10 TF-IDF + xgboost

import codecs 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

10.1 Preparing the Data

import xgboost as xgb

# 1 Load the data
labels = []
text = []
with codecs.open('output/data_clean_split.txt','r',encoding='utf-8') as f:
    document_split = f.readlines()
    for document in document_split:
        temp = document.split('\t')
        labels.append(temp[0])
        text.append(temp[1].strip())  

# 2 Encode the labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)


# 3 Extract TF-IDF text features
tfv1 = TfidfVectorizer(min_df=4,  
                       max_df=0.6)
tfv1.fit(text)
features = tfv1.transform(text)


# 4 Split the data
from sklearn.model_selection import train_test_split
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(features, y, 
                                                  stratify=y, 
                                                  random_state=42, 
                                                  test_size=0.1, shuffle=True)

10.2 Defining the Loss Function

def multiclass_logloss(actual, predicted, eps=1e-15):
    """对数损失度量(Logarithmic Loss  Metric)的多分类版本。
    :param actual: 包含actual target classes的数组
    :param predicted: 分类预测结果矩阵, 每个类别都有一个概率
    """
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

10.3 Classifying with the Model

# Use xgboost on the tf-idf features
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
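# .tocsc() below converts the vectorizer's CSR matrices to CSC format; older
# xgboost versions are commonly reported to handle CSC sparse input more
# reliably (an assumption about why it is used here; the post does not say)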
clf.fit(x_train_tfv.tocsc(), y_train)
predictions = clf.predict_proba(x_valid_tfv.tocsc())

print ("logloss: %0.3f " % multiclass_logloss(y_valid, predictions))
logloss: 0.225 

Author: foochane
Link: https://foochane.cn/article/2019091501.html
Copyright: this article is published under the Creative Commons Attribution 4.0 International license; please credit the source when reposting!