Natural Language Processing

Install nltk
conda install -c anaconda nltk

Data Set: Restaurant_Reviews.tsv (Tab Separated File)

Import Libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

Import Data Set


os.chdir('C:\\Noble\\Training\\Deep Learning\\Training\\Data\\')
os.getcwd()
# delimiter = '\t' – the file is tab separated
# quoting = 3 (csv.QUOTE_NONE) – ignore double quotes while parsing
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
dataset
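
As a quick sanity check that the file loaded correctly (assuming the label column is named 'Liked', as in the standard version of this dataset):

print(dataset.shape) # expect (1000, 2): 1000 reviews, 2 columns
print(dataset.head()) # first five rows: Review text and Liked label (1 = positive, 0 = negative)
print(dataset['Liked'].value_counts()) # class balance of the labels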

Get one row from the data set – for example, the review at index 5


dataset['Review'][5]

To Print / View all stop words


import nltk # for stop words
from nltk.corpus import stopwords
nltk.download('stopwords')
all_stopwords = stopwords.words('english')
print (all_stopwords)
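
Note that 'not' is part of NLTK's default English stop word list, so a plain stop word filter would also drop negations such as "not good". The cleaning step below removes 'not' from the list for exactly this reason. A quick check:

print(len(all_stopwords)) # size of the default English stop word list
print('not' in all_stopwords) # True – 'not' would be filtered out unless we keep it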

Cleaning the Data Set


import re
# re – Regular expression - https://docs.python.org/3/library/re.html
import nltk # for stop words
nltk.download('stopwords') # download the stop word corpus
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # for stemming, to reduce each word to its root
corpus = [] # create a list to store all cleaned reviews
for i in range(0, 1000):
    # dataset['Review'][i] - the i-th review in the data set
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i]) # replace anything other than letters with a space
    review = review.lower()
    review = review.split() # split into individual words
    ps = PorterStemmer() # stemmer to get root words
    all_stopwords = stopwords.words('english') # get the English stop words
    all_stopwords.remove('not') # keep 'not' so that negations survive cleaning
    review = [ps.stem(word) for word in review if word not in set(all_stopwords)]
    review = ' '.join(review)
    corpus.append(review)
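
To see what each step of the loop does, here is the same pipeline applied to a single made-up review (reusing ps and all_stopwords from the loop above; the sample text is purely illustrative):

sample = "The pasta wasn't good... I will NOT return!"
step1 = re.sub('[^a-zA-Z]', ' ', sample) # punctuation and digits replaced by spaces
step2 = step1.lower().split() # lower-cased and split into words
step3 = [ps.stem(w) for w in step2 if w not in set(all_stopwords)] # stop words out, stems in
print(' '.join(step3)) # roughly: "pasta good not return"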

Print Corpus
print (corpus)

Check the Number of Distinct Words


from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer() # first run without max_features; len(X[0]) below shows the full vocabulary size
X = cv.fit_transform(corpus).toarray()
len(X[0])

Create a Bag of Words (tokenization)


from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500) # 1500 was chosen from len(X[0]) above; first execute without max_features
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values # this is dependent variable
print(len(X[0])) # this gives me the max_features count
print (X)
print (y)
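
To inspect which words made it into the 1500-column vocabulary, the fitted vectorizer can list its tokens (get_feature_names_out needs scikit-learn 1.0+; older versions use get_feature_names):

vocab = cv.get_feature_names_out() # one token per column of X, in alphabetical order
print(vocab[:20]) # first 20 tokens of the vocabulary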

Train Test Split


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

Print Size
print (X.shape)
print (X_train.shape)
print (X_test.shape)

Create Naïve Bayes Classifier


from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
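
GaussianNB assumes continuous, normally distributed features. For word-count features like these, MultinomialNB is a common alternative worth trying – a sketch, not part of the original notes:

from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB() # Naive Bayes variant designed for count data
mnb.fit(X_train, y_train)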

Prediction
y_pred = classifier.predict(X_test)
Print Actual vs. Predicted Values
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)), 1))

Confusion Matrix and Accuracy Score


from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)
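
Beyond overall accuracy, per-class precision and recall can be printed with scikit-learn's classification_report – a quick sketch:

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred)) # precision, recall and F1 for both classes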

Create Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)
cm = confusion_matrix(y_test, dt_pred)
print(cm)
accuracy_score(y_test, dt_pred)
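
To classify a new, unseen review, it must pass through the same cleaning steps and the same fitted vectorizer – note cv.transform, not fit_transform. A sketch reusing re, ps, all_stopwords, cv and the Naive Bayes classifier from above, with a made-up example review:

new_review = 'The food was not good at all'
new_review = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = [ps.stem(word) for word in new_review if word not in set(all_stopwords)]
new_X = cv.transform([' '.join(new_review)]).toarray() # same 1500-column representation
print(classifier.predict(new_X)) # 0 = negative, 1 = positive (per the Liked column)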
