# Data
# See the competition page for the full task description and data format:
# https://tianchi.aliyun.com/competition/entrance/531810/introduction
# Data description: see the link above.
# Model: CNN
# Code
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve,auc
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import keras
import numpy as np
# Load the competition data. Files are tab-separated with a 'text' column
# (space-joined anonymized token ids) and, for the training set, a 'label'
# column with one of 14 news-category ids.
train_df = pd.read_csv('train_set.csv', sep='\t')
test_df = pd.read_csv('test_a.csv', sep='\t', nrows=None)

# Hold out 5% of the labelled data as a validation split.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'].values, train_df['label'].values,
    test_size=0.05, random_state=42
)

# Sanity checks — the original notebook run showed 14 classes and
# 190000 training rows.
print('num classes:', pd.Series(y_train).nunique())
print('train size:', len(X_train))
def chuli_data(datas):
    """Tokenize each document by whitespace.

    Each entry of *datas* is a string of space-separated token ids;
    returns one list of tokens per input string.
    """
    return [text.split() for text in datas]
# Keep a manageable subset for training on a CPU.
X_train = chuli_data(X_train[:25000])
X_val = chuli_data(X_val[:3000])
y_train = y_train[:25000]
y_val = y_val[:3000]

X_test = test_df['text'].values
from keras.preprocessing.sequence import pad_sequences
X_test = chuli_data(X_test)

# Pad/truncate every document to a fixed length of 500 token ids.
X_tests = pad_sequences(X_test, maxlen=500)
X_train = pad_sequences(X_train, maxlen=500)
X_val = pad_sequences(X_val, maxlen=500)
sequence_length = X_train.shape[1]

# Vocabulary size for the Embedding layer.
# BUG FIX: the original took the max token id over X_train only (printed
# 7550), so any larger id in the validation or test sets would index past
# the embedding table at lookup time.  Take the max over all three splits.
vocab_size = int(max(X_train.max(), X_val.max(), X_tests.max())) + 1
print('vocab_size:', vocab_size)

# One-hot encode the 14 class labels for categorical_crossentropy.
y_train = keras.utils.to_categorical(y_train, num_classes=14)
y_val = keras.utils.to_categorical(y_val, num_classes=14)
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv1D, MaxPooling1D
from keras import optimizers

# 1-D CNN text classifier: embedding -> three conv/pool stages -> dense head.
model = Sequential()
# Learn a 64-d vector per token id; inputs are fixed-length (500) sequences.
model.add(Embedding(vocab_size, 64, input_length=500))
model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=256, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
# 14-way softmax over the news categories.
model.add(Dense(units=14, activation='softmax'))
# NOTE: `lr` is the deprecated argument name in TF2-era Keras (the run log
# shows tensorflow.python.keras); use `learning_rate` instead.
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizers.RMSprop(learning_rate=1e-4, rho=0.9, epsilon=1e-06),
    metrics=['accuracy'],
)
print(model.summary())
# Captured output of model.summary() from a previous run:
# Model: "sequential"
# _________________________________________________________________
# Layer (type)                 Output Shape              Param #
# =================================================================
# embedding (Embedding)        (None, 500, 64)           483200
# conv1d (Conv1D)              (None, 500, 64)           12352
# max_pooling1d (MaxPooling1D) (None, 250, 64)           0
# conv1d_1 (Conv1D)            (None, 250, 128)          24704
# max_pooling1d_1 (MaxPooling1)(None, 125, 128)          0
# conv1d_2 (Conv1D)            (None, 125, 256)          98560
# max_pooling1d_2 (MaxPooling1)(None, 62, 256)           0
# flatten (Flatten)            (None, 15872)             0
# dense (Dense)                (None, 512)               8126976
# dense_1 (Dense)              (None, 14)                7182
# =================================================================
# Total params: 8,752,974
# Trainable params: 8,752,974
# Non-trainable params: 0
# _________________________________________________________________
# Train for 20 epochs.  Pass validation_data so overfitting is visible per
# epoch — in the original run the training accuracy climbed from 0.26 to
# 0.96 over 20 epochs while the one-shot final validation accuracy was
# only ~0.86, i.e. the model was overfitting unnoticed.
model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20, batch_size=100,
)

# Final held-out evaluation (previous run: loss ~0.494, accuracy ~0.862).
val_loss, val_acc = model.evaluate(X_val, y_val, batch_size=100)
print(f'validation loss={val_loss:.4f} accuracy={val_acc:.4f}')

# Predict class probabilities for the test set and derive hard labels
# (one of the 14 categories) for submission.
results = model.predict(X_tests)
pred_labels = results.argmax(axis=1)