Natural Language Processing

Install nltk
conda install -c anaconda nltk

Data Set: Restaurant_Reviews.tsv (Tab Separated File)

Import Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

Import Data Set

# Work from the folder that holds the data file. Every backslash is escaped;
# the original "\D" was an invalid escape sequence.
os.chdir('C:\\Noble\\Training\\Deep Learning\\Training\\Data\\')
# delimiter = '\t' : the file is tab separated
# quoting = 3      : treat double quotes as ordinary characters
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

Get one row from data set – example line 5


To Print / View all stop words

import nltk  # for stop words

# The stop-word corpus has to be downloaded before it can be read.
nltk.download('stopwords')
from nltk.corpus import stopwords

all_stopwords = stopwords.words('english')
print(all_stopwords)

Cleaning the Data Set

import re  # regular expressions, used to strip non-letter characters
import nltk  # for stop words

nltk.download('stopwords')  # make the stop-word corpus available
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  # stemming: reduce each word to its root

# Loop-invariant setup: built once instead of once per review.
ps = PorterStemmer()
all_stopwords = stopwords.words('english')  # English stop words
all_stopwords.remove('not')  # drop "not" from the list so negations survive cleaning
stopword_set = set(all_stopwords)  # set membership is O(1) per word

corpus = []  # one cleaned, stemmed string per review
for raw_review in dataset['Review']:
    # Replace everything that is not a letter (punctuation, digits) with a space.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    review = review.split()  # split into individual words
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)  # without this the corpus stays empty

Print Corpus
# Show the cleaned reviews held in `corpus`.
print(corpus)

To check Number of Distinct Words

from sklearn.feature_extraction.text import CountVectorizer

# First pass without max_features: the vocabulary is not capped, so
# len(X[0]) is the number of distinct words in the corpus.
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

Create a Bag of Words (tokenization)

from sklearn.feature_extraction.text import CountVectorizer

# max_features = 1500 was chosen after looking at len(X[0]) from a first
# run without max_features.
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values  # dependent variable: last column of the data set
print(len(X[0]))  # number of features per review
print(X)
print(y)

Train Test Split

from sklearn.model_selection import train_test_split

# Keep 20% of the rows aside for testing; random_state is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

Print Size
# Shapes of the full feature matrix, the training part and the test part,
# one per line.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

Create Naïve Bayes Algorithm

from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(X_train, y_train)  # train on the training split

y_pred = classifier.predict(X_test)  # predictions for the held-out reviews
Print Result Actual and Predict

Confusion Matrix to print Accuracy

from sklearn.metrics import confusion_matrix, accuracy_score

# Compare the true test labels with the predictions in y_pred.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Print the accuracy; as a bare expression its value was discarded when the
# file ran as a script.
print(accuracy_score(y_test, y_pred))

Create Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)  # train on the training split
dt_pred = dt.predict(X_test)
# confusion_matrix comes from the sklearn.metrics import earlier in the file.
cm = confusion_matrix(y_test, dt_pred)

