```python
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import Word
from sklearn.model_selection import train_test_split
```
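The NLTK resources used later (the stop-word list in 2.6 and WordNet, which backs TextBlob's lemmatizer in 2.7) have to be downloaded once. A minimal setup sketch; on newer NLTK versions `omw-1.4` may also be required:

```python
import nltk

# One-time corpus downloads (safe to re-run; they no-op if already present)
nltk.download('stopwords')  # stop-word lists used in section 2.6
nltk.download('wordnet')    # WordNet data behind Word(...).lemmatize() in 2.7
```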
2.1 Reading the data
```python
# Read the data
data = pd.read_csv('spam.csv', encoding="ISO-8859-1")
```
'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat '
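The cleaning steps between 2.1 and 2.5 are not shown in this copy. Judging from the column names used later (`label`, `text`) and the punctuation-free output above, they plausibly renamed the raw Kaggle columns and stripped punctuation; a hedged sketch of those steps (the `v1`/`v2` column names and the regex are assumptions):

```python
# Keep only the label and text columns and rename them
# (the Kaggle spam.csv ships as v1/v2 plus empty extra columns -- assumed here)
data = data[['v1', 'v2']]
data.columns = ['label', 'text']

# Strip punctuation, keeping only word characters and whitespace
data['text'] = data['text'].str.replace(r'[^\w\s]', '', regex=True)
data['text'][0]
```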
2.5 Converting words to lowercase
```python
# Convert every word to lowercase
data['text'] = data['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
# Alternatively:
# data['text'] = data['text'].apply(lambda x: x.lower())
data['text'][0]
```
'go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat'
2.6 Removing stop words
```python
# Remove stop words such as a, an, the, and high-frequency
# prepositions, conjunctions, and pronouns
stop = stopwords.words('english')
data['text'] = data['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
data['text'][0]
```
'go jurong point crazy available bugis n great world la e buffet cine got amore wat'
2.7 Stemming and lemmatization
```python
# Stem and then lemmatize each word, aiming to reduce
# English words to their base forms
st = PorterStemmer()
data['text'] = data['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
data['text'] = data['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
data['text'][0]
```
'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'
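Note that the output still shows stemmer artifacts like "crazi" and "avail": the Porter stemmer runs first and strips suffixes by rule, so the dictionary-based lemmatizer applied afterwards has little left to do. A small illustration of the difference between the two:

```python
from nltk.stem import PorterStemmer
from textblob import Word

print(PorterStemmer().stem("crazy"))  # 'crazi' -- rule-based suffix stripping
print(Word("crazy").lemmatize())      # 'crazy' -- dictionary lookup keeps real words
```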
```python
data.head()
```
```
  label                                               text
0   ham  go jurong point crazi avail bugi n great world...
1   ham  ok lar joke wif u oni
2  spam  free entri 2 wkli comp win fa cup final tkt 21...
3   ham  u dun say earli hor u c alreadi say
4   ham  nah think goe usf live around though
```
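Before extracting features it is worth checking the class balance, since SMS spam data is typically skewed heavily toward ham; a quick check (not in the original):

```python
# How many ham vs. spam messages are there?
print(data['label'].value_counts())
```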
3 Feature extraction
```python
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
```
Using TensorFlow backend.
3.1 Splitting into training and test sets
```python
# Split into training and test sets at an 8:2 ratio
train, test = train_test_split(data, test_size=0.2)
```
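With imbalanced classes, a stratified split keeps the ham/spam ratio identical in both sets. The `stratify` and `random_state` arguments below are additions for illustration, not part of the original call:

```python
# Stratified, reproducible variant of the split (arguments are assumptions)
train, test = train_test_split(data, test_size=0.2,
                               stratify=data['label'], random_state=42)
print(train.shape, test.shape)
```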
```python
# Fit a tokenizer on the training texts. The cell that created `tokenizer`,
# `train_sequences`, `test_sequences`, and `max_sequence_length` is missing
# from this copy; the values below are assumed, standard choices.
max_features = 10000         # assumed vocabulary cap (top frequent words)
max_sequence_length = 300    # assumed fixed sequence length
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train['text'])

# dictionary containing words and their index
word_index = tokenizer.word_index
# print(tokenizer.word_index)
# total words in the corpus
print('Found %s unique tokens.' % len(word_index))

# convert texts to integer index sequences
train_sequences = tokenizer.texts_to_sequences(train['text'])
test_sequences = tokenizer.texts_to_sequences(test['text'])

# get only the top frequent words on train
train_x = pad_sequences(train_sequences, maxlen=max_sequence_length)
# get only the top frequent words on test
test_x = pad_sequences(test_sequences, maxlen=max_sequence_length)
```
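The padded sequences are the model inputs; the string labels also need a numeric form before training. A minimal sketch (the 0/1 mapping is an assumption, not shown in the original):

```python
import numpy as np

# Map string labels to integers: ham -> 0, spam -> 1 (assumed encoding)
train_y = np.array(train['label'].map({'ham': 0, 'spam': 1}))
test_y = np.array(test['label'].map({'ham': 0, 'spam': 1}))
print(train_x.shape, train_y.shape)
```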
```python
# Import libraries for the model-building section
import sys, os, re, csv, codecs
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Model, Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import (Dense, Input, LSTM, Embedding, Dropout, Activation,
                          Bidirectional, GlobalMaxPool1D, Conv1D, SimpleRNN,
                          Flatten, BatchNormalization, MaxPooling1D)
```
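These imports cover several architectures (SimpleRNN, LSTM, CNN). As a reference point, a minimal classifier wired from these layers might look like the sketch below; the layer sizes and optimizer are illustrative assumptions, not the article's actual models, and `max_features` / `max_sequence_length` come from the tokenization step above:

```python
# A minimal sketch: embedding -> SimpleRNN -> sigmoid output
# (hyperparameters are assumptions, chosen only for illustration)
model = Sequential()
model.add(Embedding(max_features, 128, input_length=max_sequence_length))
model.add(SimpleRNN(64))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# model.fit(train_x, train_y, batch_size=64, epochs=3,
#           validation_data=(test_x, test_y))
```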