DEEP LEARNING ON THE US DEMOCRATIC DEBATES
By Pamela Dekas
import sys
import csv
import re
import nltk
import string
import unicodedata
from textblob import TextBlob
from collections import Counter
import pandas as pd
import numpy as np
from wordcloud import WordCloud
from nltk.classify import *
from nltk.corpus import stopwords
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import nltk.classify.util
import matplotlib.pyplot as plt
from string import punctuation
from nltk.corpus import stopwords
from wordcloud import STOPWORDS
import os
from sklearn.model_selection import train_test_split
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
Using TensorFlow backend.
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input> in <module>()
     22 import os
     23 from sklearn.model_selection import train_test_split
---> 24 from keras.datasets import imdb
     25 from keras.models import Sequential
     26 from keras.layers import Dense

~\Anaconda3\lib\site-packages\keras\__init__.py in <module>()
      1 from __future__ import absolute_import
      2
----> 3 from . import utils
      4 from . import activations
      5 from . import applications

~\Anaconda3\lib\site-packages\keras\utils\__init__.py in <module>()
      4 from . import data_utils
      5 from . import io_utils
----> 6 from . import conv_utils
      7 from . import losses_utils
      8 from . import metrics_utils

~\Anaconda3\lib\site-packages\keras\utils\conv_utils.py in <module>()
      7 from six.moves import range
      8 import numpy as np
----> 9 from .. import backend as K

~\Anaconda3\lib\site-packages\keras\backend\__init__.py in <module>()
----> 1 from .load_backend import epsilon
      2 from .load_backend import set_epsilon
      3 from .load_backend import floatx
      4 from .load_backend import set_floatx
      5 from .load_backend import cast_to_floatx

~\Anaconda3\lib\site-packages\keras\backend\load_backend.py in <module>()
     88 elif _BACKEND == 'tensorflow':
     89     sys.stderr.write('Using TensorFlow backend.\n')
---> 90     from .tensorflow_backend import *
     91 else:
     92     # Try and load external backend.

~\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py in <module>()
     52
     53 # Private TF Keras utils
---> 54 get_graph = tf_keras_backend.get_graph
     55 # learning_phase_scope = tf_keras_backend.learning_phase_scope # TODO
     56 name_scope = tf.name_scope

AttributeError: module 'tensorflow.python.keras.backend' has no attribute 'get_graph'
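This AttributeError is a symptom of a standalone keras install that is out of sync with the installed TensorFlow. One common workaround, sketched below on the assumption that a TensorFlow 2.x wheel is present, is to import the Keras API bundled with TensorFlow (tensorflow.keras) instead of the separate keras package; the rest of the notebook can then use the same names unchanged.

# Workaround sketch: import the Keras that ships inside TensorFlow,
# which avoids the keras <-> tensorflow version mismatch above.
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing import sequence, text
from tensorflow.keras.callbacks import EarlyStopping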
speech = pd.read_csv('debate_transcripts_v3_2020-02-26.csv', encoding='unicode_escape')
df = pd.DataFrame(speech)
dem_speakers = df["speaker"]
number_of_speakers = len(set(dem_speakers))
print("Number of speakers:", number_of_speakers, "speakers")
# Mean duration of a speech turn.
print("Mean speaking time:", np.mean(df["speaking_time_seconds"]), "seconds")
print("Dataset size:", len(df))
Number of speakers: 106 speakers
Mean speaking time: 16.49230769230769 seconds
Dataset size: 5911
RangeIndex: 5911 entries, 0 to 5910
Data columns (total 6 columns):
date 5911 non-null object
debate_name 5911 non-null object
debate_section 5911 non-null object
speaker 5911 non-null object
speech 5911 non-null object
speaking_time_seconds 5395 non-null float64
dtypes: float64(1), object(5)
memory usage: 277.2+ KB
df.groupby('speaker')['speaking_time_seconds'].sum(level=0).nlargest(10).plot.bar()
plt.title('Breakdown by speaking time')
plt.show()
debate_time = df.groupby(by=['speaker', 'date']).speaking_time_seconds.sum().nlargest(15)
debate_time.plot()
Removing the columns that will not be used later in the project and creating the final dataset
df = df.drop(['date', 'debate_name', 'debate_section', 'speaking_time_seconds'], axis=1)
df.head(5)
   speaker          speech
0  Norah O'Donnell  Good evening and welcome, the Democratic presi...
1  Gayle King       And Super Tuesday is just a week away and this...
2  Norah O'Donnell  And CBS News is proud to bring you this debate...
3  Gayle King       And we are partnering tonight also with Twitte...
4  Norah O'Donnell  Now, here are the rules for the next two hours...
PREPROCESSING
import nltk
nltk.download('punkt')
stopwords = nltk.corpus.stopwords.words('english')
Tailored_stopwords = ('im', 'ive', 'mr', 'weve', 'dont', 'well', 'will', 'make', 'us', 'we',
                      'I', 'make', 'got', 'need', 'want', 'think',
                      'going', 'go', 'one', 'thank', 'going',
                      'way', 'say', 'every', 're', 'us', 'first',
                      'now', 'said', 'know', 'look', 'done', 'take',
                      'number', 'two', 'three', 's', 'm', 't',
                      'let', 'don', 'tell', 've', 'im', 'mr', 'put', 'maybe', 'whether', 'many', 'll', 'around', 'thing', 'Secondly', 'doesn', 'lot')
#stopwords = nltk.corpus.stopwords.words('english')
stopwords = set(STOPWORDS)
stopwords = stopwords.union(Tailored_stopwords)
[nltk_data] Downloading package punkt to C:\Users\pamel.DESKTOP-O19M7N
[nltk_data] F\AppData\Roaming\nltk_data...
[nltk_data] Package punkt is already up-to-date!
def Text_cleansing(speech):
    speech = re.sub('@[A-Za-z0-9]+', '', str(speech))  # remove @mentions
    speech = re.sub('#', '', speech)                   # remove '#' hashtag symbols
    speech = re.sub('rt', '', speech)                  # strip 'rt'; note this also removes 'rt' inside words (e.g. 'partnering' -> 'panering')
    speech = re.sub(',', ' ', speech)
    speech = re.sub('!', ' ', speech)
    speech = re.sub(':', ' ', speech)
    speech = re.sub("'", "", speech)
    speech = re.sub('"', '', speech)
    speech = speech.lower()
    speech = word_tokenize(speech)
    return speech

def remove_stopwords(speech):
    speech_clean = [word for word in speech if word not in stopwords]
    return speech_clean
df['speech_tokens'] = df['speech'].apply(Text_cleansing)
df.head(5)
   speaker          speech                                              speech_tokens
0  Norah O'Donnell  Good evening and welcome, the Democratic presi...  [good, evening, and, welcome, the, democratic,...
1  Gayle King       And Super Tuesday is just a week away and this...  [and, super, tuesday, is, just, a, week, away,...
2  Norah O'Donnell  And CBS News is proud to bring you this debate...  [and, cbs, news, is, proud, to, bring, you, th...
3  Gayle King       And we are partnering tonight also with Twitte...  [and, we, are, panering, tonight, also, with, ...
4  Norah O'Donnell  Now, here are the rules for the next two hours...  [now, here, are, the, rules, for, the, next, t...
df['speech_clean'] = df['speech_tokens'].apply(remove_stopwords)
df.head(5)
   speaker          speech                                              speech_tokens                                       speech_clean
0  Norah O'Donnell  Good evening and welcome, the Democratic presi...  [good, evening, and, welcome, the, democratic,...  [good, evening, welcome, democratic, president...
1  Gayle King       And Super Tuesday is just a week away and this...  [and, super, tuesday, is, just, a, week, away,...  [super, tuesday, week, away, biggest, primary,...
2  Norah O'Donnell  And CBS News is proud to bring you this debate...  [and, cbs, news, is, proud, to, bring, you, th...  [cbs, news, proud, bring, debate, along, co-sp...
3  Gayle King       And we are partnering tonight also with Twitte...  [and, we, are, panering, tonight, also, with, ...  [panering, tonight, twitter, ., home, paicipat...
4  Norah O'Donnell  Now, here are the rules for the next two hours...  [now, here, are, the, rules, for, the, next, t...  [rules, next, hours, ., asked, question, minut...
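A lemmatization step is not applied in the run above (the closing notes mention that stemming/lemmatization seemed to add little). For reference, a minimal sketch of how it could be slotted in after stopword removal is shown here, assuming NLTK's WordNet data is available; lemmatize_tokens is a name introduced only for this example.

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    # Map each token to its WordNet lemma (noun form by default),
    # e.g. 'rules' -> 'rule', 'hours' -> 'hour'.
    return [lemmatizer.lemmatize(token) for token in tokens]

# df['speech_clean'] = df['speech_clean'].apply(lemmatize_tokens)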
def wordcloud(dataframe):
    # Build one word cloud from the whole column of cleaned tokens passed in.
    wordCloud = WordCloud(width=500, height=300, background_color='white', max_font_size=110).generate(str(dataframe))
    plt.imshow(wordCloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("speech wordcloud")
    plt.show()

wordcloud(df['speech_clean'])
For the rest of the project, the list of speakers is reduced to the most prominent candidates (top 7 speakers)
df = df.loc[df.speaker.isin({'Joe Biden', 'Bernie Sanders', 'Elizabeth Warren', 'Michael Bloomberg', 'Pete Buttigieg', 'Amy Klobuchar', 'Tulsi Gabbard'})]
df.head()
df.shape
CountVectorizer and creation of the per-candidate word dictionary, to be used with the back-up ML models
Lexical analysis
cv = CountVectorizer(stop_words=stopwords)
df_cv = cv.fit_transform(df.speech)
df_words = pd.DataFrame(df_cv.toarray(), columns=cv.get_feature_names())
df_words.index = df.speaker
df_words = df_words.transpose()
df_words
speaker  Bernie Sanders  Michael Bloomberg  Michael Bloomberg  Bernie Sanders  ...  Amy Klobuchar  Elizabeth Warren  Elizabeth Warren
00                    0                  0                  0               0  ...              0                 0                 0
000                   2                  0                  0               0  ...              0                 0                 0
001st                 0                  0                  0               0  ...              0                 0                 0
...                 ...                ...                ...             ...  ...            ...               ...               ...
zeroed                0                  0                  0               0  ...              0                 0                 0
zip                   0                  0                  0               0  ...              0                 0                 0
zone                  0                  0                  0               0  ...              0                 0                 0

6385 rows × 2245 columns
top_dict = {}
for c in df_words.columns:
    top = df_words[c].sort_values(ascending=False).head(30)
    top_dict[c] = list(zip(top.index, top.values))

for speaker, top_words in top_dict.items():
    print(speaker)
    print(', '.join([word for word, count in top_words[0:9]]))
    print('---')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input> in <module>()
      1 top_dict = {}
      2 for c in df_words.columns:
----> 3     top = df_words[c].sort_values(ascending=False).head(30)
      4     top_dict[c] = list(zip(top.index, top.values))
      5 for speaker, top_words in top_dict.items():

TypeError: sort_values() missing 1 required positional argument: 'by'
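The TypeError here and the KeyError further down share the same cause: several speeches belong to the same speaker, so df_words has duplicate column labels. df_words[c] therefore returns a DataFrame rather than a Series (whose sort_values requires a 'by' argument), and top_dict never ends up keyed by the seven speaker names. A minimal sketch of one way around this, assuming the goal is one aggregated word-count column per speaker, is to collapse the duplicate columns first:

# Aggregate duplicate speaker columns into one column per speaker
# (transpose, group rows by speaker label, sum, transpose back).
df_words_agg = df_words.T.groupby(level=0).sum().T

top_dict = {}
for speaker in df_words_agg.columns:
    # df_words_agg[speaker] is now a Series, so sort_values() works as intended.
    top = df_words_agg[speaker].sort_values(ascending=False).head(30)
    top_dict[speaker] = list(zip(top.index, top.values))

With df_words_agg in place, the later Counter loop would also find the speaker keys it expects.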
df2 = pd.DataFrame(top_dict)
df2.head(15)
from collections import Counter
words = []
for speaker in df_words.columns:
    top = [word for (word, count) in top_dict[speaker]]
    for t in top:
        words.append(t)

Counter(words).most_common(15)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input> in <module>()
      2 words = []
      3 for speaker in df_words.columns:
----> 4     top = [word for (word, count) in top_dict[speaker]]
      5     for t in top:
      6         words.append(t)

KeyError: 'Bernie Sanders'
Implementation of the model
print(df.columns)
print(df.shape)
df['speaker'] = df['speaker'].astype(str)
Index(['speaker', 'speech', 'speech_tokens', 'speech_clean'], dtype='object')
(2245, 4)
Embedding
RANDOM_STATE = 50
EPOCHS = 5
BATCH_SIZE = 256
EMB_DIM = 100
SAVE_MODEL = True
X = df['speech_clean']
print(X.head())
X.shape
5     [well, you're, right, economy, really, great, ...
6     [senator-]
8     [think, donald, trump, thinks, would, better, ...
9     [oh, mr., bloomberg, ., let, tell, mr., putin,...
11    [know, president, russia, wants, it's, chaos, .]
Name: speech_clean, dtype: object
(2245,)
import gensim

emb_model = gensim.models.Word2Vec(sentences=X, size=EMB_DIM, window=5, workers=4, min_count=1)
print('The learned vocabulary size is', len(list(emb_model.wv.vocab)))
The learned vocabulary size is 7139
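As a quick sanity check on the learned vectors (not part of the original run), gensim's most_similar can be queried for a debate-related term; 'healthcare' below is only an illustrative query and can be replaced by any token present in emb_model.wv.vocab.

# Inspect the nearest neighbours of a sample token in the learned embedding space.
print(emb_model.wv.most_similar('healthcare', topn=5))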
from keras.preprocessing.text import Tokenizer
import tokenize

max_length = max([len(s) for s in X])
tokenizer_new = Tokenizer()
tokenizer_new.fit_on_texts(X)
X_seq = tokenizer_new.texts_to_sequences(X)
X_fin = sequence.pad_sequences(X_seq, maxlen=max_length)
print(X_fin.shape)

emb_vec = emb_model.wv
MAX_NB_WORDS = len(list(emb_vec.vocab))
tokenizer_word_index = tokenizer_new.word_index
vocab_size = len(tokenizer_new.word_index) + 1
embedded_matrix = np.zeros((vocab_size, EMB_DIM))
for word, i in tokenizer_word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    try:
        embedding_vector = emb_vec[word]
        wv_matrix[i] = embedding_vector  # 'wv_matrix' is undefined here; the bare except below silently skips every word
    except:
        pass

embedded_matrix.shape
print(embedded_matrix)
[[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
...
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]]
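The all-zero matrix above is consistent with the loop writing into the undefined name wv_matrix while the bare except swallows the resulting NameError, so no Word2Vec vector is ever copied in. A minimal corrected version of the loop, assuming the intent was to fill embedded_matrix, would be:

for word, i in tokenizer_word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    try:
        # Copy the learned Word2Vec vector into the row for this token id.
        embedded_matrix[i] = emb_vec[word]
    except KeyError:
        # Token not in the Word2Vec vocabulary: leave its row as zeros.
        pass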
Preparation of the variables
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

y = df.speaker
print(y.head(10))
y.shape
5 1
6 4
8 4
9 1
11 5
12 2
13 2
15 5
21 3
23 1
Name: speaker, dtype: int32
(2245,)
Counter({'Bernie Sanders': 430,
'Michael Bloomberg': 97,
'Pete Buttigieg': 392,
'Elizabeth Warren': 440,
'Joe Biden': 456,
'Amy Klobuchar': 353,
'Tulsi Gabbard': 77})
le = LabelEncoder()
df['speaker'] = le.fit_transform(df['speaker'])
df.head()
y = df.speaker
y.head()
print(y.shape)
print(X_fin.shape)

X_train, X_test, y_train, y_test = train_test_split(X_fin, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(y_train.shape)
Building the neural network
model_pre_trained = Sequential()
model_pre_trained.add(Embedding(vocab_size, EMB_DIM, weights=[embedded_matrix],
                                input_length=max_length, trainable=False))
model_pre_trained.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_pre_trained.add(Dense(1, activation='softmax'))
model_pre_trained.compile(loss='categorical_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
model_pre_trained.summary()
Model: "sequential_11"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_11 (Embedding) (None, 140, 100) 714000
_________________________________________________________________
lstm_13 (LSTM) (None, 128) 117248
_________________________________________________________________
dense_9 (Dense) (None, 1) 129
=================================================================
Total params: 831,377
Trainable params: 117,377
Non-trainable params: 714,000
_________________________________________________________________
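A single softmax unit combined with categorical_crossentropy on integer labels cannot represent a 7-class problem, which is the most likely explanation for the negative losses and flat accuracy in the training log below. A corrected head, sketched here under the assumption that y holds the integer labels produced by the LabelEncoder, uses seven output units with sparse_categorical_crossentropy; it also fits on X_train/y_train rather than on all of X_fin, so that the later evaluate on X_test is a genuine held-out score.

num_classes = len(le.classes_)  # 7 candidates

model_fixed = Sequential()
model_fixed.add(Embedding(vocab_size, EMB_DIM, weights=[embedded_matrix],
                          input_length=max_length, trainable=False))
model_fixed.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_fixed.add(Dense(num_classes, activation='softmax'))  # one probability per candidate
model_fixed.compile(loss='sparse_categorical_crossentropy',  # integer labels from LabelEncoder
                    optimizer='adam',
                    metrics=['accuracy'])

history_fixed = model_fixed.fit(X_train, y_train, batch_size=BATCH_SIZE,
                                epochs=20, verbose=1, validation_split=0.2)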
Fitting
history_pre_trained = model_pre_trained.fit(X_fin, y, batch_size=BATCH_SIZE, epochs=20, verbose=1, validation_split=0.2)
Train on 1796 samples, validate on 449 samples
Epoch 1/20
1796/1796 [==============================] - 4s 2ms/step - loss: 0.5429 - accuracy: 0.1754 - val_loss: -0.4417 - val_accuracy: 0.2472
Epoch 2/20
1796/1796 [==============================] - 3s 2ms/step - loss: -6.7429 - accuracy: 0.1776 - val_loss: -14.1017 - val_accuracy: 0.2472
Epoch 3/20
1796/1796 [==============================] - 3s 2ms/step - loss: -15.8550 - accuracy: 0.1776 - val_loss: -19.5441 - val_accuracy: 0.2472
Epoch 4/20
1796/1796 [==============================] - 3s 2ms/step - loss: -20.7949 - accuracy: 0.1776 - val_loss: -23.4335 - val_accuracy: 0.2472
Epoch 5/20
1796/1796 [==============================] - 3s 2ms/step - loss: -24.1430 - accuracy: 0.1776 - val_loss: -25.9735 - val_accuracy: 0.2472
Epoch 6/20
1796/1796 [==============================] - 3s 2ms/step - loss: -26.4535 - accuracy: 0.1776 - val_loss: -28.0725 - val_accuracy: 0.2472
Epoch 7/20
1796/1796 [==============================] - 3s 2ms/step - loss: -28.4266 - accuracy: 0.1776 - val_loss: -29.9313 - val_accuracy: 0.2472
Epoch 8/20
1796/1796 [==============================] - 3s 2ms/step - loss: -30.1754 - accuracy: 0.1776 - val_loss: -31.6261 - val_accuracy: 0.2472
Epoch 9/20
1796/1796 [==============================] - 3s 2ms/step - loss: -31.8791 - accuracy: 0.1776 - val_loss: -33.3337 - val_accuracy: 0.2472
Epoch 10/20
1796/1796 [==============================] - 4s 2ms/step - loss: -33.5166 - accuracy: 0.1776 - val_loss: -34.9834 - val_accuracy: 0.2472
Epoch 11/20
1796/1796 [==============================] - 3s 2ms/step - loss: -35.1544 - accuracy: 0.1776 - val_loss: -36.5973 - val_accuracy: 0.2472
Epoch 12/20
1796/1796 [==============================] - 3s 2ms/step - loss: -36.7253 - accuracy: 0.1776 - val_loss: -38.2070 - val_accuracy: 0.2472
Epoch 13/20
1796/1796 [==============================] - 3s 2ms/step - loss: -38.3344 - accuracy: 0.1776 - val_loss: -39.8655 - val_accuracy: 0.2472
Epoch 14/20
1796/1796 [==============================] - 3s 2ms/step - loss: -39.9810 - accuracy: 0.1776 - val_loss: -41.5162 - val_accuracy: 0.2472
Epoch 15/20
1796/1796 [==============================] - 3s 1ms/step - loss: -41.6567 - accuracy: 0.1776 - val_loss: -43.2049 - val_accuracy: 0.2472
Epoch 16/20
1796/1796 [==============================] - 3s 1ms/step - loss: -43.2579 - accuracy: 0.1776 - val_loss: -44.8235 - val_accuracy: 0.2472
Epoch 17/20
1796/1796 [==============================] - 3s 1ms/step - loss: -44.9030 - accuracy: 0.1776 - val_loss: -46.4982 - val_accuracy: 0.2472
Epoch 18/20
1796/1796 [==============================] - 2s 1ms/step - loss: -46.5038 - accuracy: 0.1776 - val_loss: -48.0627 - val_accuracy: 0.2472
Epoch 19/20
1796/1796 [==============================] - 3s 1ms/step - loss: -48.0124 - accuracy: 0.1776 - val_loss: -49.5424 - val_accuracy: 0.2472
Epoch 20/20
1796/1796 [==============================] - 2s 1ms/step - loss: -49.5209 - accuracy: 0.1776 - val_loss: -51.1489 - val_accuracy: 0.2472
Model evaluation
score = model_pre_trained.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
Test loss: -51.148848297866785
Test accuracy: 0.18930958211421967
Problems: a large number of stopwords still need to be added to the dictionary, doubts about the activation function, and stemming/lemmatization seems to bring little. Areas for improvement: explore n-grams to give words more context (see the sketch below), and build a custom stopword dictionary tailored to the debates (association of ideas).
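On the n-gram idea, a minimal sketch of how bigrams could be brought into the lexical analysis is to extend the existing CountVectorizer; ngram_range=(1, 2) below is just one possible setting, and cv_ngrams is a name introduced for this example.

# Count unigrams and bigrams instead of unigrams only, reusing the same stopword set.
cv_ngrams = CountVectorizer(stop_words=list(stopwords), ngram_range=(1, 2))
df_cv_ngrams = cv_ngrams.fit_transform(df.speech)
print(df_cv_ngrams.shape)
print(cv_ngrams.get_feature_names()[:20])  # inspect some of the unigram/bigram features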