Published: Mon 20 February 2017
In Machine Learning.
# Load libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
# Read in training data
reviews = pd.read_csv('reviews.txt', header=None)
labels = pd.read_csv('labels.txt', header=None)
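Before going further it helps to peek at what was loaded. The snippet below is just an optional sanity check, assuming each file has one review or one label per row and no header.
# Optional sanity check: peek at the loaded data
print(reviews.shape, labels.shape)
print(reviews.iloc[0, 0][:80])   # first 80 characters of the first review
print(labels.iloc[0, 0])         # should be 'positive' or 'negative'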
# Count word frequency
from collections import Counter

total_counts = Counter()
for _, row in reviews.iterrows():
    total_counts.update(row[0].split(' '))

print("Total words in data set: ", len(total_counts))
('Total words in data set: ', 74074)
# Sort and print top vocab words
vocab = sorted(total_counts, key=total_counts.get, reverse=True)[:10000]
print(vocab[:60])
['', 'the', '.', 'and', 'a', 'of', 'to', 'is', 'br', 'it', 'in', 'i', 'this', 'that', 's', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'you', 'on', 't', 'not', 'he', 'are', 'his', 'have', 'be', 'one', 'all', 'at', 'they', 'by', 'an', 'who', 'so', 'from', 'like', 'there', 'her', 'or', 'just', 'about', 'out', 'if', 'has', 'what', 'some', 'good', 'can', 'more', 'she', 'when', 'very', 'up', 'time', 'no']
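To get a feel for how aggressive the 10,000-word cutoff is, one can check how often the least frequent word that still made it into the vocabulary appears. This check is not part of the original run, just an optional probe.
# Optional: frequency of the rarest word kept in the vocabulary
print(vocab[-1], ':', total_counts[vocab[-1]])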
# Assign index to each word
word2idx = {word: i for i, word in enumerate(vocab)}
# Convert text to vector
def text_to_vector(text):
    word_vector = np.zeros(len(vocab), dtype=np.int_)
    for word in text.split(' '):
        idx = word2idx.get(word, None)
        if idx is not None:
            word_vector[idx] += 1
    return np.array(word_vector)

word_vectors = np.zeros((len(reviews), len(vocab)), dtype=np.int_)
for ii, (_, text) in enumerate(reviews.iterrows()):
    word_vectors[ii] = text_to_vector(text[0])
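A quick way to convince yourself the vectorizer behaves as intended is to run it on a short made-up sentence and list the non-zero counts; words outside the 10,000-word vocabulary are simply ignored. This is an optional check, not part of the original run.
# Optional check: vectorize a sample sentence and show which vocabulary words were counted
sample = text_to_vector('this movie was terrible but the acting was great')
print([(vocab[i], int(count)) for i, count in enumerate(sample) if count > 0])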
# Split into training and test sets (validation data is carved off the training set during fit)
Y = (labels == 'positive').astype(np.int_)
records = len(labels)

shuffle = np.arange(records)
np.random.shuffle(shuffle)
train_fraction = 0.9  # 90% of the data for training, the remaining 10% for testing

train_split, test_split = shuffle[:int(records * train_fraction)], shuffle[int(records * train_fraction):]
trainX, trainY = word_vectors[train_split, :], to_categorical(Y.values[train_split], 2)
testX, testY = word_vectors[test_split, :], to_categorical(Y.values[test_split], 2)
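Before building the network it is worth confirming that the shapes line up: the feature matrices should have 10,000 columns and the targets two one-hot columns. A minimal check:
# Optional: confirm the split shapes before training
print(trainX.shape, trainY.shape)   # (n_train, 10000) and (n_train, 2)
print(testX.shape, testY.shape)     # (n_test, 10000) and (n_test, 2)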
# Network building
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()

    # Inputs
    net = tflearn.input_data([None, 10000])

    # Hidden layer(s)
    net = tflearn.fully_connected(net, 200, activation='ReLU')
    net = tflearn.fully_connected(net, 25, activation='ReLU')

    # Output layer
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='sgd',
                             learning_rate=0.1,
                             loss='categorical_crossentropy')

    model = tflearn.DNN(net)
    return model
# Initialize model
model = build_model ()
# Training
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=100)
Training Step: 15899 | total loss: 0.21230 | time: 4.150s
| SGD | epoch: 100 | loss: 0.21230 - acc: 0.9071 -- iter: 20224/20250
Training Step: 15900 | total loss: 0.20658 | time: 5.185s
| SGD | epoch: 100 | loss: 0.20658 - acc: 0.9054 | val_loss: 0.38249 - val_acc: 0.8631 -- iter: 20250/20250
--
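A hundred epochs of SGD takes a while, so it can be worth saving the learned weights; tflearn's DNN wrapper provides save and load for this. The sketch below uses an arbitrary file name of my choosing.
# Optional: persist the trained weights (file name is arbitrary)
model.save('sentiment_model.tfl')
# Later, rebuild the same graph and restore the weights:
# model = build_model()
# model.load('sentiment_model.tfl')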
# Test accuracy
predictions = (np.array(model.predict(testX))[:, 0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:, 0], axis=0)
print("Test accuracy: ", test_accuracy)
('Test accuracy: ', 0.86519999999999997)
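As a cross-check (not part of the original run), tflearn's built-in evaluate method computes accuracy directly against the one-hot targets and should land close to the figure above.
# Optional cross-check with tflearn's built-in metric
print("Evaluate:", model.evaluate(testX, testY))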
# Helper function that uses your model to predict sentiment
def test_sentence(sentence):
    positive_prob = model.predict([text_to_vector(sentence.lower())])[0][1]
    print('Sentence: {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(positive_prob),
          'Positive' if positive_prob > 0.5 else 'Negative')
# Test examples
sentence = "this is a great day"
test_sentence ( sentence )
sentence = "this is a disappointing experience"
test_sentence ( sentence )
sentence = "this is a greatly disappointing experience"
test_sentence ( sentence )
sentence = "I am not going to recommend this person"
test_sentence ( sentence )
sentence = "I cannot believe how good she was"
test_sentence ( sentence )
sentence = "I thought very highly of the person"
test_sentence ( sentence )
sentence = "I thought he was not up to the mark"
test_sentence ( sentence )
sentence = "The discussion was very enlightening"
test_sentence ( sentence )
sentence = "He could have been more prepared"
test_sentence ( sentence )
sentence = "His skills in programming were clearly evident"
test_sentence ( sentence )
sentence = "Some improvement in communication would have helped"
test_sentence ( sentence )
sentence = "She does not have a great deal of experience with databases"
test_sentence ( sentence )
sentence = "His software skills were limited"
test_sentence ( sentence )
sentence = "His background in scalable systems is promising"
test_sentence ( sentence )
sentence = "It was unusual that he did not know about MapReduce"
test_sentence ( sentence )
sentence = "It was not evident that he could perform well"
test_sentence ( sentence )
sentence = "His coding skills were amazing"
test_sentence ( sentence )
sentence = "He did not perform well on the coding test"
test_sentence ( sentence )
sentence = "He demonstrated a good understanding of databases"
test_sentence ( sentence )
sentence = "The logic he used to explain that problem was not correct"
test_sentence ( sentence )
Sentence: this is a great day
('P(positive) = 0.999 :', 'Positive')
Sentence: this is a disappointing experience
('P(positive) = 0.240 :', 'Negative')
Sentence: this is a greatly disappointing experience
('P(positive) = 0.309 :', 'Negative')
Sentence: I am not going to recommend this person
('P(positive) = 0.227 :', 'Negative')
Sentence: I cannot believe how good she was
('P(positive) = 0.615 :', 'Positive')
Sentence: I thought very highly of the person
('P(positive) = 0.991 :', 'Positive')
Sentence: I thought he was not up to the mark
('P(positive) = 0.586 :', 'Positive')
Sentence: The discussion was very enlightening
('P(positive) = 0.681 :', 'Positive')
Sentence: He could have been more prepared
('P(positive) = 0.250 :', 'Negative')
Sentence: His skills in programming were clearly evident
('P(positive) = 0.427 :', 'Negative')
Sentence: Some improvement in communication would have helped
('P(positive) = 0.165 :', 'Negative')
Sentence: She does not have a great deal of experience with databases
('P(positive) = 0.985 :', 'Positive')
Sentence: His software skills were limited
('P(positive) = 0.514 :', 'Positive')
Sentence: His background in scalable systems is promising
('P(positive) = 0.436 :', 'Negative')
Sentence: It was unusual that he did not know about MapReduce
('P(positive) = 0.713 :', 'Positive')
Sentence: It was not evident that he could perform well
('P(positive) = 0.453 :', 'Negative')
Sentence: His coding skills were amazing
('P(positive) = 0.991 :', 'Positive')
Sentence: He did not perform well on the coding test
('P(positive) = 0.731 :', 'Positive')
Sentence: He demonstrated a good understanding of databases
('P(positive) = 0.821 :', 'Positive')
Sentence: The logic he used to explain that problem was not correct
('P(positive) = 0.061 :', 'Negative')