import codecs import gensim from sklearn import preprocessing from sklearn.preprocessing import LabelEncoder import numpy as np import xgboost as xgb from tqdm import tqdm
1.1 导入数据
python
1 2 3 4 5 6 7 8 9 10
import codecs
# Read the tab-separated corpus: field 0 of every line is the label,
# field 1 is the document text.
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
for document in document_split:
    temp = document.split('\t')
    labels.append(temp[0])
    # A line without a tab has no temp[1] and raises IndexError here.
    text.append(temp[1].strip())
1.2 标签转换为数字
python
1 2 3
from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
1.3 TF-IDF提取文本特征
python
1 2 3 4 5 6 7 8
from sklearn.feature_extraction.text import TfidfVectorizer

tfv1 = TfidfVectorizer(min_df=4, max_df=0.6)

# Fit TF-IDF on the whole corpus, i.e. on the training and validation text
# together (semi-supervised style), then vectorize the same corpus.
tfv1.fit(text)
features = tfv1.transform(text)
查看分词数目
python
1 2
# Number of terms in the vocabulary kept by the fitted TF-IDF vectorizer.
len(tfv1.get_feature_names())
84412
1.4 切分数据
python
1 2 3 4 5
from sklearn.model_selection import train_test_split

# Stratified 90/10 train/validation split of the TF-IDF features, fixed seed.
x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(
    features,
    y,
    stratify=y,
    random_state=42,
    test_size=0.1,
    shuffle=True,
)
1.5 使用模型分类
python
1 2 3 4 5 6 7 8 9
#利用提取的TFIDF特征来fit一个简单的Logistic Regression
from sklearn.linear_model import LogisticRegression
logloss: 0.564
/home/ubuntu/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
"of iterations.", ConvergenceWarning)
2 WordCounts特征+逻辑回归
python
1 2 3 4 5 6
import codecs import numpy as np from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer from sklearn.linear_model import LogisticRegression
2.1 导入数据
python
1 2 3 4 5 6 7 8 9
# Read the tab-separated corpus: field 0 of every line is the label,
# field 1 is the document text.
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
for document in document_split:
    temp = document.split('\t')
    labels.append(temp[0])
    # A line without a tab has no temp[1] and raises IndexError here.
    text.append(temp[1].strip())
2.2 标签转换为数字
python
1 2
# Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

from sklearn.model_selection import train_test_split

# NOTE(review): `text_ctv` is not defined in the visible part of this section;
# presumably it is the CountVectorizer matrix built from `text` - confirm.
# Stratified 90/10 train/validation split with a fixed seed.
x_train_ctv, x_valid_ctv, y_train, y_valid = train_test_split(
    text_ctv,
    y,
    stratify=y,
    random_state=42,
    test_size=0.1,
    shuffle=True,
)
2.5 定义损失函数
python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the Logarithmic Loss metric.

    :param actual: array of the actual target classes; either a 1-D array of
        integer class indices or an already one-hot encoded 2-D array
    :param predicted: matrix of predicted probabilities, one column per class
    :param eps: predictions are clipped to [eps, 1 - eps] so log(0) never occurs
    :return: negative log-likelihood of the true classes, averaged over rows
    """
    # Convert 'actual' to a binary (one-hot) array if it's not already:
    if len(actual.shape) == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    return -np.sum(actual * np.log(clipped)) / rows
logloss: 0.784
/home/ubuntu/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
"of iterations.", ConvergenceWarning)
3 TF-IDF+朴素贝叶斯
python
1 2 3 4 5 6
import codecs import numpy as np from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer from sklearn.linear_model import LogisticRegression
3.1 导入数据
python
1 2 3 4 5 6 7 8
# Read the tab-separated corpus: field 0 of every line is the label,
# field 1 is the document text.
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
for document in document_split:
    temp = document.split('\t')
    labels.append(temp[0])
    # A line without a tab has no temp[1] and raises IndexError here.
    text.append(temp[1].strip())
3.2 标签转换为数字
python
1 2
# Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

from sklearn.model_selection import train_test_split

# NOTE(review): `text_ctv` is not defined in the visible part of this section;
# presumably it is the vectorized feature matrix built from `text` - confirm.
# Stratified 90/10 train/validation split with a fixed seed.
x_train_ctv, x_valid_ctv, y_train, y_valid = train_test_split(
    text_ctv,
    y,
    stratify=y,
    random_state=42,
    test_size=0.1,
    shuffle=True,
)
3.5 定义损失函数
python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the Logarithmic Loss metric.

    :param actual: array of the actual target classes; either a 1-D array of
        integer class indices or an already one-hot encoded 2-D array
    :param predicted: matrix of predicted probabilities, one column per class
    :param eps: predictions are clipped to [eps, 1 - eps] so log(0) never occurs
    :return: negative log-likelihood of the true classes, averaged over rows
    """
    # Convert 'actual' to a binary (one-hot) array if it's not already:
    if len(actual.shape) == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    return -np.sum(actual * np.log(clipped)) / rows
logloss: 0.784
/home/ubuntu/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:758: ConvergenceWarning: lbfgs failed to converge. Increase the number of iterations.
"of iterations.", ConvergenceWarning)
4 WordCounts特征+朴素贝叶斯
python
1 2 3 4 5 6
import codecs import numpy as np from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer from sklearn.naive_bayes import MultinomialNB
4.1 导入数据
python
1 2 3 4 5 6 7 8 9
# Read the tab-separated corpus: field 0 of every line is the label,
# field 1 is the document text.
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
for document in document_split:
    temp = document.split('\t')
    labels.append(temp[0])
    # A line without a tab has no temp[1] and raises IndexError here.
    text.append(temp[1].strip())
4.2 标签转换为数字
python
1 2
# Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

from sklearn.model_selection import train_test_split

# NOTE(review): `text_ctv` is not defined in the visible part of this section;
# presumably it is the CountVectorizer matrix built from `text` - confirm.
# Stratified 90/10 train/validation split with a fixed seed.
x_train_ctv, x_valid_ctv, y_train, y_valid = train_test_split(
    text_ctv,
    y,
    stratify=y,
    random_state=42,
    test_size=0.1,
    shuffle=True,
)
4.5 定义损失函数
python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the Logarithmic Loss metric.

    :param actual: array of the actual target classes; either a 1-D array of
        integer class indices or an already one-hot encoded 2-D array
    :param predicted: matrix of predicted probabilities, one column per class
    :param eps: predictions are clipped to [eps, 1 - eps] so log(0) never occurs
    :return: negative log-likelihood of the true classes, averaged over rows
    """
    # Convert 'actual' to a binary (one-hot) array if it's not already:
    if len(actual.shape) == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    return -np.sum(actual * np.log(clipped)) / rows
import codecs import numpy as np from sklearn.preprocessing import LabelEncoder from sklearn.feature_extraction.text import TfidfVectorizer from sklearn import preprocessing, decomposition from sklearn.svm import SVC
# 1 Load the data: field 0 of every tab-separated line is the label,
#   field 1 is the document text.
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
for document in document_split:
    temp = document.split('\t')
    labels.append(temp[0])
    # A line without a tab has no temp[1] and raises IndexError here.
    text.append(temp[1].strip())

# 2 Encode the string labels as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# 4 Split the data set (stratified 90/10, fixed seed)
# NOTE(review): `features` is not defined in the visible part of this section;
# presumably it is the TF-IDF matrix produced by a missing step 3 - confirm.
from sklearn.model_selection import train_test_split

x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(
    features,
    y,
    stratify=y,
    random_state=42,
    test_size=0.1,
    shuffle=True,
)
5.2 定义损失函数
python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the Logarithmic Loss metric.

    :param actual: array of the actual target classes; either a 1-D array of
        integer class indices or an already one-hot encoded 2-D array
    :param predicted: matrix of predicted probabilities, one column per class
    :param eps: predictions are clipped to [eps, 1 - eps] so log(0) never occurs
    :return: negative log-likelihood of the true classes, averaged over rows
    """
    # Convert 'actual' to a binary (one-hot) array if it's not already:
    if len(actual.shape) == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    return -np.sum(actual * np.log(clipped)) / rows
import codecs from sklearn import preprocessing,metrics, pipeline from sklearn.preprocessing import LabelEncoder from sklearn.metrics import classification_report from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.decomposition import TruncatedSVD from sklearn.model_selection import GridSearchCV import numpy as np
# 1 Load the data: field 0 of every tab-separated line is the label,
#   field 1 is the document text.
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
for document in document_split:
    temp = document.split('\t')
    labels.append(temp[0])
    # A line without a tab has no temp[1] and raises IndexError here.
    text.append(temp[1].strip())

# 2 Encode the string labels as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# 4 Split the data set (stratified 90/10, fixed seed)
# NOTE(review): `features` is not defined in the visible part of this section;
# presumably it is the TF-IDF matrix produced by a missing step 3 - confirm.
from sklearn.model_selection import train_test_split

x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(
    features,
    y,
    stratify=y,
    random_state=42,
    test_size=0.1,
    shuffle=True,
)
6.2 定义损失函数
python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the Logarithmic Loss metric.

    :param actual: array of the actual target classes; either a 1-D array of
        integer class indices or an already one-hot encoded 2-D array
    :param predicted: matrix of predicted probabilities, one column per class
    :param eps: predictions are clipped to [eps, 1 - eps] so log(0) never occurs
    :return: negative log-likelihood of the true classes, averaged over rows
    """
    # Convert 'actual' to a binary (one-hot) array if it's not already:
    if len(actual.shape) == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    return -np.sum(actual * np.log(clipped)) / rows
import codecs import gensim from sklearn import preprocessing from sklearn.preprocessing import LabelEncoder import numpy as np import xgboost as xgb from tqdm import tqdm
7.1 数据准备
python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
# Read the data: field 0 of every tab-separated line is the label,
# field 1 is the document text.
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
for document in document_split:
    temp = document.split('\t')
    labels.append(temp[0])
    # A line without a tab has no temp[1] and raises IndexError here.
    text.append(temp[1].strip())

# Encode the string labels as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split every sentence into its individual (whitespace-separated) words
text_s2w = [s.split() for s in text]
7.2 构建word2vec模型
python
1 2 3 4 5
# Train a Word2Vec model on the tokenized sentences; words seen fewer than
# 5 times are dropped and every word vector has 100 dimensions.
model = gensim.models.Word2Vec(text_s2w, min_count=5, workers=6, window =8, size=100)
# Turns a sentence into a normalized vector.
def sent2vec(s):
    """Convert a sentence into one L2-normalized 100-dimensional vector.

    The vectors of all words of ``s`` that the Word2Vec ``model`` knows are
    summed and the sum is scaled to unit length.

    :param s: sentence as a string of whitespace-separated words
    :return: numpy array of length 100; all zeros when no word of the
        sentence is in the model vocabulary or the summed vector has zero norm
    """
    M = []
    for w in s.split():
        try:
            # Index through `model.wv`: `model[w]` is deprecated in gensim.
            M.append(model.wv[w])
        except KeyError:
            # Word is not in the model vocabulary; skip it.
            continue
    M = np.array(M)  # shape (x, 100): x known words, 100 dims per word vector
    v = M.sum(axis=0)  # per-dimension sum over the x word vectors
    # An empty M sums to a scalar rather than a vector.
    if not isinstance(v, np.ndarray):
        return np.zeros(100)
    norm = np.sqrt((v ** 2).sum())
    # Guard against 0/0 when the word vectors cancel out exactly.
    if norm == 0:
        return np.zeros(100)
    return v / norm  # normalize so every sentence becomes a unit vector
python
1 2 3 4 5 6 7 8 9 10 11 12
# Vectorize every document with sent2vec (covers both the training and the
# validation set, which are split afterwards)
text_s2v = [sent2vec(s) for s in tqdm(text)]

# Convert to a numpy array
text_s2v = np.array(text_s2v)

# Split the data set (stratified 90/10, fixed seed)
from sklearn.model_selection import train_test_split

x_train_w2v, x_valid_w2v, y_train, y_valid = train_test_split(
    text_s2v,
    y,
    stratify=y,
    random_state=42,
    test_size=0.1,
    shuffle=True,
)
0%| | 0/9249 [00:00<?, ?it/s]/home/ubuntu/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:11: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
# This is added back by InteractiveShellApp.init_path()
100%|██████████| 9249/9249 [01:11<00:00, 129.79it/s]
7.5 调用模型进行分类
python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
# Define the loss function
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the Logarithmic Loss metric.

    :param actual: array of the actual target classes; either a 1-D array of
        integer class indices or an already one-hot encoded 2-D array
    :param predicted: matrix of predicted probabilities, one column per class
    :param eps: predictions are clipped to [eps, 1 - eps] so log(0) never occurs
    :return: negative log-likelihood of the true classes, averaged over rows
    """
    # Convert 'actual' to a binary (one-hot) array if it's not already:
    if len(actual.shape) == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    return -np.sum(actual * np.log(clipped)) / rows
import codecs import numpy as np from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer import xgboost as xgb
8.1 导入数据
python
1 2 3 4 5 6 7 8 9
# Read the tab-separated corpus: field 0 of every line is the label,
# field 1 is the document text.
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
for document in document_split:
    temp = document.split('\t')
    labels.append(temp[0])
    # A line without a tab has no temp[1] and raises IndexError here.
    text.append(temp[1].strip())
8.2 标签转换为数字
python
1 2
# Encode the string labels as integer class ids.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

from sklearn.model_selection import train_test_split

# NOTE(review): `text_ctv` is not defined in the visible part of this section;
# presumably it is the CountVectorizer matrix built from `text` - confirm.
# Stratified 90/10 train/validation split with a fixed seed.
x_train_ctv, x_valid_ctv, y_train, y_valid = train_test_split(
    text_ctv,
    y,
    stratify=y,
    random_state=42,
    test_size=0.1,
    shuffle=True,
)
8.5 定义损失函数
python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the Logarithmic Loss metric.

    :param actual: array of the actual target classes; either a 1-D array of
        integer class indices or an already one-hot encoded 2-D array
    :param predicted: matrix of predicted probabilities, one column per class
    :param eps: predictions are clipped to [eps, 1 - eps] so log(0) never occurs
    :return: negative log-likelihood of the true classes, averaged over rows
    """
    # Convert 'actual' to a binary (one-hot) array if it's not already:
    if len(actual.shape) == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    return -np.sum(actual * np.log(clipped)) / rows
# 1 Load the data: field 0 of every tab-separated line is the label,
#   field 1 is the document text.
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
for document in document_split:
    temp = document.split('\t')
    labels.append(temp[0])
    # A line without a tab has no temp[1] and raises IndexError here.
    text.append(temp[1].strip())

# 2 Encode the string labels as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class version of the Logarithmic Loss metric.

    :param actual: array of the actual target classes; either a 1-D array of
        integer class indices or an already one-hot encoded 2-D array
    :param predicted: matrix of predicted probabilities, one column per class
    :param eps: predictions are clipped to [eps, 1 - eps] so log(0) never occurs
    :return: negative log-likelihood of the true classes, averaged over rows
    """
    # Convert 'actual' to a binary (one-hot) array if it's not already:
    if len(actual.shape) == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    return -np.sum(actual * np.log(clipped)) / rows
# 1 Load the data: field 0 of every tab-separated line is the label,
#   field 1 is the document text.
labels = []
text = []
with codecs.open('output/data_clean_split.txt', 'r', encoding='utf-8') as f:
    document_split = f.readlines()
for document in document_split:
    temp = document.split('\t')
    labels.append(temp[0])
    # A line without a tab has no temp[1] and raises IndexError here.
    text.append(temp[1].strip())

# 2 Encode the string labels as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# 4 Split the data set (stratified 90/10, fixed seed)
# NOTE(review): `features` is not defined in the visible part of this section;
# presumably it is the feature matrix produced by a missing step 3 - confirm.
from sklearn.model_selection import train_test_split

x_train_tfv, x_valid_tfv, y_train, y_valid = train_test_split(
    features,
    y,
    stratify=y,
    random_state=42,
    test_size=0.1,
    shuffle=True,
)
10.2 定义损失函数
python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
defmulticlass_logloss(actual, predicted, eps=1e-15): """对数损失度量(Logarithmic Loss Metric)的多分类版本。 :param actual: 包含actual target classes的数组 :param predicted: 分类预测结果矩阵, 每个类别都有一个概率 """ # Convert 'actual' to a binary array if it's not already: if len(actual.shape) == 1: actual2 = np.zeros((actual.shape[0], predicted.shape[1])) for i, val in enumerate(actual): actual2[i, val] = 1 actual = actual2