Sentiment Analysis

A bag-of-words sentiment classifier built with TFLearn: each movie review is converted into a 10,000-dimensional word-count vector and fed to a small fully connected network that predicts a positive or negative label.

# Load libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
# Read in training data
reviews = pd.read_csv('reviews.txt', header=None)
labels = pd.read_csv('labels.txt', header=None)
# Count word frequency
from collections import Counter
total_counts = Counter()
for _, row in reviews.iterrows():
    total_counts.update(row[0].split(' '))
print("Total words in data set: ", len(total_counts))
Total words in data set:  74074
# Sort and print top vocab words
vocab = sorted(total_counts, key=total_counts.get, reverse=True)[:10000]
print(vocab[:60])
['', 'the', '.', 'and', 'a', 'of', 'to', 'is', 'br', 'it', 'in', 'i', 'this', 'that', 's', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'you', 'on', 't', 'not', 'he', 'are', 'his', 'have', 'be', 'one', 'all', 'at', 'they', 'by', 'an', 'who', 'so', 'from', 'like', 'there', 'her', 'or', 'just', 'about', 'out', 'if', 'has', 'what', 'some', 'good', 'can', 'more', 'she', 'when', 'very', 'up', 'time', 'no']
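As a quick sanity check (not shown in the original run), printing the count of the least frequent word that made the 10,000-word cut gives a feel for how much of the long tail is being discarded:

# Frequency of the rarest word kept in the vocabulary
print(vocab[-1], ': ', total_counts[vocab[-1]])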
# Assign index to each word
word2idx = {word: i for i, word in enumerate(vocab)}
# Convert text to vector
def text_to_vector(text):
    word_vector = np.zeros(len(vocab), dtype=np.int_)
    for word in text.split(' '):
        idx = word2idx.get(word, None)
        if idx is not None:
            word_vector[idx] += 1
    return word_vector
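A small illustrative check (not from the original run): every word in the sentence below appears near the top of the vocabulary printed above, so the counts land at known indices.

# Vectorize a short sentence and inspect a few of its counts
vec = text_to_vector('the movie was not good')
print(vec.sum(), vec[word2idx['the']], vec[word2idx['movie']])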

word_vectors = np.zeros((len(reviews), len(vocab)), dtype=np.int_)
for ii, (_, text) in enumerate(reviews.iterrows()):
    word_vectors[ii] = text_to_vector(text[0])
# Split into train, validation and test sets
Y = (labels=='positive').astype(np.int_)
records = len(labels)

shuffle = np.arange(records)
np.random.shuffle(shuffle)
train_fraction = 0.9

train_split, test_split = shuffle[:int(records*train_fraction)], shuffle[int(records*train_fraction):]
trainX, trainY = word_vectors[train_split,:], to_categorical(Y.values[train_split], 2)
testX, testY = word_vectors[test_split,:], to_categorical(Y.values[test_split], 2)
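To confirm the split (illustrative, not part of the original output), the array shapes can be printed; about 90% of the rows should land in the training set, and each label should be one-hot encoded into two columns.

# Sanity-check the shapes of the train/test split
print(trainX.shape, trainY.shape, testX.shape, testY.shape)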
# Network building
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()

    # Inputs
    net = tflearn.input_data([None, 10000])

    # Hidden layer(s)
    net = tflearn.fully_connected(net, 200, activation='ReLU')
    net = tflearn.fully_connected(net, 25, activation='ReLU')

    # Output layer
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='sgd', 
                             learning_rate=0.1, 
                             loss='categorical_crossentropy')

    model = tflearn.DNN(net)
    return model
# Initialize model
model = build_model()
# Training
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=100)
Training Step: 15899  | total loss: 0.21230 | time: 4.150s
| SGD | epoch: 100 | loss: 0.21230 - acc: 0.9071 -- iter: 20224/20250
Training Step: 15900  | total loss: 0.20658 | time: 5.185s
| SGD | epoch: 100 | loss: 0.20658 - acc: 0.9054 | val_loss: 0.38249 - val_acc: 0.8631 -- iter: 20250/20250
--
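As an aside (a sketch using tflearn's DNN save/load API, not something shown in the original run), the trained weights can be written to disk so the 100-epoch fit does not have to be repeated; the file name below is illustrative.

# Persist the trained weights (illustrative file name)
model.save('sentiment_model.tfl')
# Later, in a fresh session, rebuild the same architecture and restore:
#   model = build_model()
#   model.load('sentiment_model.tfl')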
# Test accuracy
# Column 0 of the softmax output is P(negative); threshold it and compare
# against the negative-class column of the one-hot test labels
predictions = (np.array(model.predict(testX))[:,0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:,0], axis=0)
print("Test accuracy: ", test_accuracy)
Test accuracy:  0.8652
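Equivalently (a sketch, not part of the original run), tflearn's built-in evaluate method scores the model against the one-hot test labels directly and should report roughly the same accuracy.

# DNN.evaluate returns the metric defined in the regression layer (accuracy here)
print("Test accuracy (evaluate): ", model.evaluate(testX, testY))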
# Helper function that uses your model to predict sentiment
def test_sentence(sentence):
    positive_prob = model.predict([text_to_vector(sentence.lower())])[0][1]
    print('Sentence: {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(positive_prob), 
          'Positive' if positive_prob > 0.5 else 'Negative')
# Test examples
sentence = "this is a great day"
test_sentence(sentence)

sentence = "this is a disappointing experience"
test_sentence(sentence)

sentence = "this is a greatly disappointing experience"
test_sentence(sentence)

sentence = "I am not going to recommend this person"
test_sentence(sentence)

sentence = "I cannot believe how good she was"
test_sentence(sentence)

sentence = "I thought very highly of the person"
test_sentence(sentence)

sentence = "I thought he was not up to the mark"
test_sentence(sentence)

sentence = "The discussion was very enlightening"
test_sentence(sentence)

sentence = "He could have been more prepared"
test_sentence(sentence)

sentence = "His skills in programming were clearly evident"
test_sentence(sentence)

sentence = "Some improvement in communication would have helped"
test_sentence(sentence)

sentence = "She does not have a great deal of experience with databases"
test_sentence(sentence)

sentence = "His software skills were limited"
test_sentence(sentence)

sentence = "His background in scalable systems is promising"
test_sentence(sentence)

sentence = "It was unusual that he did not know about MapReduce"
test_sentence(sentence)

sentence = "It was not evident that he could perform well"
test_sentence(sentence)

sentence = "His coding skills were amazing"
test_sentence(sentence)

sentence = "He did not perform well on the coding test"
test_sentence(sentence)

sentence = "He demonstrated a good understanding of databases"
test_sentence(sentence)

sentence = "The logic he used to explain that problem was not correct"
test_sentence(sentence)
Sentence: this is a great day
P(positive) = 0.999 : Positive
Sentence: this is a disappointing experience
P(positive) = 0.240 : Negative
Sentence: this is a greatly disappointing experience
P(positive) = 0.309 : Negative
Sentence: I am not going to recommend this person
P(positive) = 0.227 : Negative
Sentence: I cannot believe how good she was
P(positive) = 0.615 : Positive
Sentence: I thought very highly of the person
P(positive) = 0.991 : Positive
Sentence: I thought he was not up to the mark
P(positive) = 0.586 : Positive
Sentence: The discussion was very enlightening
P(positive) = 0.681 : Positive
Sentence: He could have been more prepared
P(positive) = 0.250 : Negative
Sentence: His skills in programming were clearly evident
P(positive) = 0.427 : Negative
Sentence: Some improvement in communication would have helped
P(positive) = 0.165 : Negative
Sentence: She does not have a great deal of experience with databases
P(positive) = 0.985 : Positive
Sentence: His software skills were limited
P(positive) = 0.514 : Positive
Sentence: His background in scalable systems is promising
P(positive) = 0.436 : Negative
Sentence: It was unusual that he did not know about MapReduce
P(positive) = 0.713 : Positive
Sentence: It was not evident that he could perform well
P(positive) = 0.453 : Negative
Sentence: His coding skills were amazing
P(positive) = 0.991 : Positive
Sentence: He did not perform well on the coding test
P(positive) = 0.731 : Positive
Sentence: He demonstrated a good understanding of databases
P(positive) = 0.821 : Positive
Sentence: The logic he used to explain that problem was not correct
P(positive) = 0.061 : Negative
