Nothing Special   »   [go: up one dir, main page]

Section 6 - Jupyter Notebook

Download as pdf or txt
Download as pdf or txt
You are on page 1of 11

In [1]: # This Python 3 environment comes with many helpful analytics libraries installed

# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python


# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
 

In [2]: pip install nltk


Requirement already satisfied: nltk in c:\anaconda\lib\site-packages (3.8.1)


Requirement already satisfied: click in c:\anaconda\lib\site-packages (from nltk) (8.0.4)
Requirement already satisfied: joblib in c:\anaconda\lib\site-packages (from nltk) (1.2.0)
Requirement already satisfied: regex>=2021.8.3 in c:\anaconda\lib\site-packages (from nltk) (2022.7.9)
Requirement already satisfied: tqdm in c:\anaconda\lib\site-packages (from nltk) (4.65.0)
Requirement already satisfied: colorama in c:\anaconda\lib\site-packages (from click->nltk) (0.4.6)
Note: you may need to restart the kernel to use updated packages.

In [4]: import nltk

In [7]: pip install wordcloud


#‫كالود بيتخزن فيه الكلمات االكثر شيوعا وتكرارا ويبرزها بشكل أفضل‬

Requirement already satisfied: wordcloud in c:\anaconda\lib\site-packages (1.9.3)


Requirement already satisfied: numpy>=1.6.1 in c:\anaconda\lib\site-packages (from wordcloud) (1.24.3)
Requirement already satisfied: pillow in c:\anaconda\lib\site-packages (from wordcloud) (9.4.0)
Requirement already satisfied: matplotlib in c:\anaconda\lib\site-packages (from wordcloud) (3.7.2)
Requirement already satisfied: contourpy>=1.0.1 in c:\anaconda\lib\site-packages (from matplotlib->wordcloud) (1.0.5)
Requirement already satisfied: cycler>=0.10 in c:\anaconda\lib\site-packages (from matplotlib->wordcloud) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\anaconda\lib\site-packages (from matplotlib->wordcloud) (4.25.
0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\anaconda\lib\site-packages (from matplotlib->wordcloud) (1.4.
4)
Requirement already satisfied: packaging>=20.0 in c:\anaconda\lib\site-packages (from matplotlib->wordcloud) (23.1)
Requirement already satisfied: pyparsing<3.1,>=2.3.1 in c:\anaconda\lib\site-packages (from matplotlib->wordcloud)
(3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\anaconda\lib\site-packages (from matplotlib->wordcloud) (2.
8.2)
Requirement already satisfied: six>=1.5 in c:\anaconda\lib\site-packages (from python-dateutil>=2.7->matplotlib->word
cloud) (1.16.0)
Note: you may need to restart the kernel to use updated packages.

In [8]: import matplotlib.pyplot as plt


import plotly.express as px
from wordcloud import WordCloud
import nltk
import re
import string
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = stopwords.words()

[nltk_data] Downloading package punkt to


[nltk_data] C:\Users\KTS\AppData\Roaming\nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data] C:\Users\KTS\AppData\Roaming\nltk_data...
[nltk_data] Package stopwords is already up-to-date!
In [13]: # 4.2. Import the data:
df=pd.read_csv('IMDB Dataset.csv',nrows=2000)
df.head()

Out[13]:
review sentiment

0 One of the other reviewers has mentioned that ... positive

1 A wonderful little production. <br /><br />The... positive

2 I thought this was a wonderful way to spend ti... positive

3 Basically there's a family where a little boy ... negative

4 Petter Mattei's "Love in the Time of Money" is... positive

In [14]: df.info()
#‫يقدم معلومات عن ال‬data frame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 review 2000 non-null object
1 sentiment 2000 non-null object
dtypes: object(2)
memory usage: 31.4+ KB

In [15]: df.describe().T
#‫يقدم احصائية عن البيانات الموجودة‬
#‫العدد اإلجمالي للقيم في العمود‬.
#‫المتوسط الحسابي للعمر‬
#‫االنحراف المعياري للعمر‬
#‫القيمة الدنيا للعمر‬
#‫القيمة القصوى للعمر‬
#‫الربع األول والثاني والثالث للعمر‬

Out[15]:
count unique top freq

review 2000 2000 One of the other reviewers has mentioned that ... 1

sentiment 2000 2 positive 1005

In [16]: #sentiment count:


df['sentiment'].value_counts()
#‫تقوم بعد القيم الموجودة في عمود معين وتعد كم مرة تظهر كل قيمة‬.
#‫ تقييم سلبي في البيانات‬995 ‫ تقييم إيجابي و‬1005 ‫فهذا يعني أن هناك‬.

Out[16]: sentiment
positive 1005
negative 995
Name: count, dtype: int64

In [10]: df['review'].str.len().hist()
#‫يقوم برسم هيستوغرام لطول التقييمات(عدد األحرف) في العمود‬

Out[10]: <Axes: >


In [17]: fig,(ax1,ax2)=plt.subplots(1,2,figsize=(12,8))
ax1.hist(df[df['sentiment']=='positive']['review'].str.len())
ax1.set_title( 'Positive Reviews')
ax2.hist(df[df['sentiment']=='negative']['review'].str.len())
ax2.set_title( 'Negative Reviews')

#‫هذا الكود ينشئ رسمين بيانيين جانبيين لعرض توزيع طول التقييمات للتقييمات اإليجابية والسلبية بشكل منفصل‬
#‫يتم ذلك عن طريق رسم هيستوغرام لطول التقييمات في كل فئة (إيجابي أو سلبي) على التوالي‬
#‫هذا يسمح بفهم الفروق في طول التقييمات بين التقييمات اإليجابية والسلبية بشكل سريع ومقارنتها بسهولة‬

Out[17]: Text(0.5, 1.0, 'Negative Reviews')

In [18]: df.rename(columns={'review':'text'}, inplace = True)


df
#‫ يثوم بتغير اسم العمود الي‬Text
#‫ثم يقوم بعرض المحتوي الذي فيه‬

Out[18]:
text sentiment

0 One of the other reviewers has mentioned that ... positive

1 A wonderful little production. <br /><br />The... positive

2 I thought this was a wonderful way to spend ti... positive

3 Basically there's a family where a little boy ... negative

4 Petter Mattei's "Love in the Time of Money" is... positive

... ... ...

1995 Feeling Minnesota, directed by Steven Baigelma... negative

1996 THE CELL (2000) Rating: 8/10<br /><br />The Ce... positive

1997 This movie, despite its list of B, C, and D li... negative

1998 I loved this movie! It was all I could do not ... positive

1999 This was the worst movie I have ever seen Bill... negative

2000 rows × 2 columns


In [19]: def cleaning(text):
# converting to lowercase, removing URL links, special characters, punctuations...
text = text.lower() # converting to lowercase
text = re.sub('https?://\S+|www\.\S+', '', text) # removing URL links
text = re.sub(r"\b\d+\b", "", text) # removing number
text = re.sub('<.*?>+', '', text) # removing special characters,
text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # punctuations
text = re.sub('\n', '', text)
text = re.sub('[’“”…]', '', text)

#removing emoji:
emoji_pattern = re.compile("["
u"\U0001F600-\U0001F64F" # emoticons
u"\U0001F300-\U0001F5FF" # symbols & pictographs
u"\U0001F680-\U0001F6FF" # transport & map symbols
u"\U0001F1E0-\U0001F1FF" # flags (iOS)
u"\U00002702-\U000027B0"
u"\U000024C2-\U0001F251"
"]+", flags=re.UNICODE)
text = emoji_pattern.sub(r'', text)

# removing short form:

text=re.sub("isn't",'is not',text)
text=re.sub("he's",'he is',text)
text=re.sub("wasn't",'was not',text)
text=re.sub("there's",'there is',text)
text=re.sub("couldn't",'could not',text)
text=re.sub("won't",'will not',text)
text=re.sub("they're",'they are',text)
text=re.sub("she's",'she is',text)
text=re.sub("There's",'there is',text)
text=re.sub("wouldn't",'would not',text)
text=re.sub("haven't",'have not',text)
text=re.sub("That's",'That is',text)
text=re.sub("you've",'you have',text)
text=re.sub("He's",'He is',text)
text=re.sub("what's",'what is',text)
text=re.sub("weren't",'were not',text)
text=re.sub("we're",'we are',text)
text=re.sub("hasn't",'has not',text)
text=re.sub("you'd",'you would',text)
text=re.sub("shouldn't",'should not',text)
text=re.sub("let's",'let us',text)
text=re.sub("they've",'they have',text)
text=re.sub("You'll",'You will',text)
text=re.sub("i'm",'i am',text)
text=re.sub("we've",'we have',text)
text=re.sub("it's",'it is',text)
text=re.sub("don't",'do not',text)
text=re.sub("that´s",'that is',text)
text=re.sub("I´m",'I am',text)
text=re.sub("it’s",'it is',text)
text=re.sub("she´s",'she is',text)
text=re.sub("he’s'",'he is',text)
text=re.sub('I’m','I am',text)
text=re.sub('I’d','I did',text)
text=re.sub("he’s'",'he is',text)
text=re.sub('there’s','there is',text)

return text

dt = df['text'].apply(cleaning)

In [16]: df['sentiment']
#‫" يستخدم السترجاع العمود المسمى‬sentiment" ‫ من‬DataFrame
#‫ سواء كانت إيجابية أو سلبية‬،‫العمود يحتوي على معلومات المشاعر المرتبطة بالتقييمات‬

Out[16]: 0 positive
1 positive
2 positive
3 negative
4 positive
...
1995 negative
1996 positive
1997 negative
1998 positive
1999 negative
Name: sentiment, Length: 2000, dtype: object
In [20]: dt = pd.DataFrame(dt)
dt['sentiment']=df['sentiment']
dt

Out[20]:
text sentiment

0 one of the other reviewers has mentioned that ... positive

1 a wonderful little production the filming tech... positive

2 i thought this was a wonderful way to spend ti... positive

3 basically theres a family where a little boy j... negative

4 petter matteis love in the time of money is a ... positive

... ... ...

1995 feeling minnesota directed by steven baigelman... negative

1996 the cell rating the cell like antz must be wa... positive

1997 this movie despite its list of b c and d list ... negative

1998 i loved this movie it was all i could do not t... positive

1999 this was the worst movie i have ever seen bill... negative

2000 rows × 2 columns

In [21]: # remove stop word:


dt['no_sw'] = dt['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [19]: dt

Out[19]:
text sentiment no_sw

0 one of the other reviewers has mentioned that ... positive reviewers mentioned watching oz episode youll ...

1 a wonderful little production the filming tech... positive wonderful production filming technique unassum...

2 i thought this was a wonderful way to spend ti... positive wonderful spend time hot summer weekend sittin...

3 basically theres a family where a little boy j... negative basically family boy jake thinks zombie closet...

4 petter matteis love in the time of money is a ... positive petter matteis love time money visually stunni...

... ... ... ...

1995 feeling minnesota directed by steven baigelman... negative feeling minnesota directed steven baigelmann s...

1996 the cell rating the cell like antz must be wa... positive cell rating cell antz watched appreciated time...

1997 this movie despite its list of b c and d list ... negative movie despite list b list celebs complete wast...

1998 i loved this movie it was all i could do not t... positive loved movie break tears watching uplifting str...

1999 this was the worst movie i have ever seen bill... negative worst movie billy zane understand movie showca...

2000 rows × 3 columns


In [22]: #Working with the most Frequent Words:
from collections import Counter
cnt = Counter()
for text in dt["no_sw"].values:
for word in text.split():
cnt[word] += 1
cnt.most_common(10)
temp = pd.DataFrame(cnt.most_common(10))
temp.columns=['word', 'count']
temp
#‫ يتم إنشاء متغير‬cnt ‫ من نوع‬Counter ‫لتخزين تكرار الكلمات‬.
#‫يتم استخدام حلقتين متداخلتين لتفريق كل نص إلى كلماته وحساب تكرار كل كلمة‬.
#cnt.most_common(10) ‫ كلمات تكراًرا وترتيبها حسب التكرار‬10 ‫ُيعيد أكثر‬.
#‫ يتم إنشاء‬DataFrame ‫ جديد ُيسمى‬temp ‫يحتوي على قائمة الكلمات األكثر تكراًرا مع عددها‬.
#‫" ُيعين أسماء األعمدة‬word" ‫" و‬count" ‫ل‬DataFrame temp.
#‫ هيطبع‬،‫وبمجرد تنفيذ هذا الكود‬
#DataFrame temp
#‫الكلمات العشر األكثر تكراًرا في النصوص المنظفة مع عددها‬

Out[22]:
word count

0 movie 3376

1 film 2889

2 story 862

3 time 826

4 movies 667

5 great 661

6 made 616

7 make 600

8 characters 586

9 films 566
In [23]: px.bar(temp, x="count", y="word", title='Commmon Words in Text', orientation='h',
width=700, height=700)
#‫ سيتم رسم شريطي يعرض الكلمات األكثر تكراًرا في النصوص المنظفة‬،‫بعد تنفيذ هذا الكود‬،
#‫حيث يتم عرض عدد تكرار كل كلمة على المحور األفقي والكلمة نفسها على المحور العمودي‬.

Commmon Words in Text

films

characters

make

made

great
word

movies

time

story

film

movie

0 500 1000 1500 2000 2500 3000 3500

count

In [24]: # Remove the most frequent words:


FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
def remove_freqwords(text):
"""custom function to remove the frequent words"""
return " ".join([word for word in str(text).split() if word not in FREQWORDS])
dt["wo_stopfreq"] = dt["no_sw"].apply(lambda text: remove_freqwords(text))
dt.head()
#‫الكود يقوم بإزالة الكلمات األكثر تكرارًا من النصوص المنظفة لتحسين جودة التحليل النصي‬

Out[24]:
text sentiment no_sw wo_stopfreq

one of the other reviewers has mentioned that reviewers mentioned watching oz episode youll
0 positive reviewers mentioned watching oz episode youll ...
... ...

wonderful production filming technique wonderful production filming technique


1 a wonderful little production the filming tech... positive
unassum... unassum...

wonderful spend time hot summer weekend wonderful spend hot summer weekend sitting
2 i thought this was a wonderful way to spend ti... positive
sittin... air...

3 basically theres a family where a little boy j... negative basically family boy jake thinks zombie closet... basically family boy jake thinks zombie closet...

4 petter matteis love in the time of money is a ... positive petter matteis love time money visually stunni... petter matteis love money visually stunning wa...

In [25]: dt['no_sw'].loc[5]
#‫ في العمود‬5 ‫" هذا الكود ُيظهر النص المنظف من الكلمات الوقفية للصف رقم‬no_sw" ‫ من‬DataFrame dt

Out[25]: 'probably alltime favorite movie story selflessness sacrifice dedication noble preachy boring despite times years pau
l lukas performance brings tears eyes bette davis sympathetic roles delight kids grandma dressedup midgets children m
akes fun watch mothers slow awakening whats happening world roof believable startling dozen thumbs theyd movie'

In [26]: dt['wo_stopfreq'].loc[5]
#‫ في العمود‬5 ‫" هذا الكود ُيظهر النص الذي تم إزالة منه الكلمات الشائعة والوقفية للصف رقم‬wo_stopfreq" ‫ من‬DataFrame dt.

Out[26]: 'probably alltime favorite selflessness sacrifice dedication noble preachy boring despite times years paul lukas perf
ormance brings tears eyes bette davis sympathetic roles delight kids grandma dressedup midgets children makes fun wat
ch mothers slow awakening whats happening world roof believable startling dozen thumbs theyd'
In [27]: # Lemmatization: Lemmatization is converting the word to its base form or lemma by removing affixes from the inflected
# It helps to create better features for machine learning and NLP models hence it is an important preprocessing step.

wordnet_lem = WordNetLemmatizer()

dt['wo_stopfreq_lem'] = dt['wo_stopfreq'].apply(wordnet_lem.lemmatize)
dt

#wordnet_lem = WordNetLemmatizer() ‫ يقوم بتهيئة محرك‬Lemmatization.

#dt['wo_stopfreq'].apply(wordnet_lem.lemmatize) ‫يطبق عملية‬
#Lemmatization ‫" على كل نص في العمود‬wo_stopfreq" ‫ من‬DataFrame dt.

# "wo_stopfreq_lem" ‫ في‬DataFrame dt.
#‫النتائج يتم تخزينها في عمود جديد باسم‬

# Lemmatization ‫عملية‬
#‫مفيدة لتحسين معالم النصوص وتقليل التباين بين الكلمات المشابهة مما يساعد في تحسين أداء نماذج التعلم اآللي‬
#‫وموديالت معالجة اللغة الطبيعية‬

Out[27]:
text sentiment no_sw wo_stopfreq wo_stopfreq_lem

one of the other reviewers has reviewers mentioned watching oz reviewers mentioned watching oz reviewers mentioned watching oz
0 positive
mentioned that ... episode youll ... episode youll ... episode youll ...

a wonderful little production the wonderful production filming wonderful production filming wonderful production filming
1 positive
filming tech... technique unassum... technique unassum... technique unassum...

i thought this was a wonderful wonderful spend time hot summer wonderful spend hot summer wonderful spend hot summer
2 positive
way to spend ti... weekend sittin... weekend sitting air... weekend sitting air...

basically theres a family where a basically family boy jake thinks basically family boy jake thinks basically family boy jake thinks
3 negative
little boy j... zombie closet... zombie closet... zombie closet...

petter matteis love in the time of petter matteis love time money petter matteis love money visually petter matteis love money visually
4 positive
money is a ... visually stunni... stunning wa... stunning wa...

... ... ... ... ... ...

feeling minnesota directed by feeling minnesota directed steven feeling minnesota directed steven feeling minnesota directed steven
1995 negative
steven baigelman... baigelmann s... baigelmann s... baigelmann s...

the cell rating the cell like antz cell rating cell antz watched cell rating cell antz watched cell rating cell antz watched
1996 positive
must be wa... appreciated time... appreciated medi... appreciated medi...

this movie despite its list of b c movie despite list b list celebs despite list b list celebs complete despite list b list celebs complete
1997 negative
and d list ... complete wast... waste minu... waste minu...

i loved this movie it was all i could loved movie break tears watching loved break tears watching loved break tears watching uplifting
1998 positive
do not t... uplifting str... uplifting struck pe... struck pe...

this was the worst movie i have worst movie billy zane understand worst billy zane understand worst billy zane understand
1999 negative
ever seen bill... movie showca... showcase comers pr... showcase comers pr...

2000 rows × 5 columns

In [28]: # create the cleaned data for the train-test split:


#‫ هذا الكود يقوم بإنشاء‬DataFrame ‫جديد يحتوي على األعمدة التي تم االحتفاظ بها من‬
#‫" باستثناء األعمدة‬text" ‫" و‬no_sw" ‫" و‬wo_stopfreq"
nb=dt.drop(columns=['text','no_sw', 'wo_stopfreq'])
#‫" يتم حذف األعمدة‬text" ‫" و‬no_sw" ‫" و‬wo_stopfreq" ‫ من‬DataFrame dt.
nb.columns=['sentiment','review']
# ‫ يتم إعادة تسمية األعمدة المتبقية في‬DataFrame ‫ الجديد‬nb ‫" إلى‬sentiment" ‫" و‬review".
nb.sentiment = [0 if each == "negative" else 1 for each in nb.sentiment]
#‫" ُيعيد تسمية القيم في عمود‬sentiment" ‫ إذا كانت إيجابية‬1 ‫ وتكون‬،‫ إذا كانت القيمة سلبية‬0 ‫لتكون‬.
nb
#‫بهذا‬، DataFrame nb ‫ األول‬،‫" الجديد يحتوي على عمودين‬sentiment" )‫ لإليجابية‬1 ‫ للسلبية و‬0( ‫الذي يحتوي على تصنيفات المشاعر‬،
#‫" والثاني‬review" ‫الذي يحتوي على النصوص المنظفة والمعالجة‬.

Out[28]:
sentiment review

0 1 reviewers mentioned watching oz episode youll ...

1 1 wonderful production filming technique unassum...

2 1 wonderful spend hot summer weekend sitting air...

3 0 basically family boy jake thinks zombie closet...

4 1 petter matteis love money visually stunning wa...

... ... ...

1995 0 feeling minnesota directed steven baigelmann s...

1996 1 cell rating cell antz watched appreciated medi...

1997 0 despite list b list celebs complete waste minu...

1998 1 loved break tears watching uplifting struck pe...

1999 0 worst billy zane understand showcase comers pr...

2000 rows × 2 columns


In [29]: #‫( هذا الكود يقوم بتطبيق عملية الفصل إلى عناصر‬Tokenization) ‫" على النصوص في عمود‬review"
#‫" عملية الفصل إلى عناصر تقوم بتقسيم النص إلى عناصر فردية تعرف باسم‬tokens" ‫" أو‬tokensized words".
tokenized_review=nb['review'].apply(lambda x: x.split())
#‫ يتم تطبيق الدالة‬split() ‫" على كل نص في العمود‬review"
#‫تقوم هذه الدالة بتقسيم النص إلى كلمات فردية بناًء على المسافات بينها‬
tokenized_review.head(5)
#‫ النتيجة هي‬Series ‫جديدة تحتوي على قائمة من الكلمات المفصولة لكل نص‬

Out[29]: 0 [reviewers, mentioned, watching, oz, episode, ...


1 [wonderful, production, filming, technique, un...
2 [wonderful, spend, hot, summer, weekend, sitti...
3 [basically, family, boy, jake, thinks, zombie,...
4 [petter, matteis, love, money, visually, stunn...
Name: review, dtype: object

In [1]: from sklearn.feature_extraction.text import CountVectorizer


#‫ يقوم بتحميل األداة‬CountVectorizer ‫التي تساعد في تحويل النصوص إلى مصفوفة من األعداد‬.
from nltk.tokenize import RegexpTokenizer
#‫يستخدم لتعريف المفردات المستخدمة لتقسيم النصوص إلى كلمات‬.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
#‫( يعرف مفردات التقسيم باستخدام تعبيرات منتظمة‬Regular Expressions) ‫الستخراج الكلمات من النصوص‬
#‫ يتم استخدام تعبير منتظم يختار الكلمات التي تحتوي على األحرف األبجدية واألرقام‬،‫في هذه الحالة‬.
cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
#‫ يقوم بإنشاء مثيل لـ‬CountVectorizer ‫مع تحديد بعض الخصائص مثل‬:
#stop_words='english': ‫يقوم بإزالة الكلمات الوقفية اإلنجليزية‬
#‫يستخدم مفردات التقسيم التي تم تعريفها مسبًق ا لتقسيم النصوص‬
text_counts = cv.fit_transform(nb['review'])
#‫ يتم استخدام‬CountVectorizer ‫" لتحويل النصوص في عمود‬review"
#‫ثم إلى مصفوفة من األعداد التي تمثل تواتر ظهور كل كلمة في كل نص‬.

---------------------------------------------------------------------------
NameError Traceback (most recent call last)
Cell In[1], line 12
8 cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
9 #‫ يقوم بإنشاء مثيل لـ‬CountVectorizer ‫مع تحديد بعض الخصائص مثل‬:
10 #stop_words='english': ‫يقوم بإزالة الكلمات الوقفية اإلنجليزية‬
11 #‫يستخدم مفردات التقسيم التي تم تعريفها مسبًقا لتقسيم النصوص‬
---> 12 text_counts = cv.fit_transform(nb['review'])

NameError: name 'nb' is not defined

In [51]: from sklearn.model_selection import train_test_split


X=text_counts
y=nb['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=30)

# ‫الكود ده يقسم البيانات إلى مجموعات للتدريب واالختبار بحيث يتم تخزين المتغيرات التي سيتم تدريب النموذج عليها في‬
#X_train ‫ و‬y_train
#‫ والبيانات التي سيتم اختبار النموذج عليها في‬X_test ‫ و‬y_test.
In [38]: from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report, confusion_matrix
CNB = ComplementNB()
CNB.fit(X_train, y_train)

from sklearn import metrics
predicted = CNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, y_test)

print('ComplementNB model accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, predicted))
#‫ تصنيف النصوص بناًء على البيانات التي تم تحويلها إلى مصفوفة بواسطة‬CountVectorizer.
#‫النموذج يتم تدريبه باستخدام بيانات التدريب ومن ثم يتم اختبار أدائه باستخدام بيانات االختبار‬.
#‫يتم طباعة دقة النموذج باإلضافة إلى مصفوفة االلتباس وتقرير التصنيف لتقييم أداء النموذج‬.

ComplementNB model accuracy is 80.00%


------------------------------------------------
Confusion Matrix:
0 1
0 160 38
1 42 160
------------------------------------------------
Classification Report:
precision recall f1-score support

0 0.79 0.81 0.80 198


1 0.81 0.79 0.80 202

accuracy 0.80 400


macro avg 0.80 0.80 0.80 400
weighted avg 0.80 0.80 0.80 400

In [41]: pip install --upgrade scikit-learn


Requirement already satisfied: scikit-learn in c:\anaconda\lib\site-packages (1.4.2)


Requirement already satisfied: numpy>=1.19.5 in c:\anaconda\lib\site-packages (from scikit-learn) (1.24.3)
Requirement already satisfied: scipy>=1.6.0 in c:\anaconda\lib\site-packages (from scikit-learn) (1.11.1)
Requirement already satisfied: joblib>=1.2.0 in c:\anaconda\lib\site-packages (from scikit-learn) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\anaconda\lib\site-packages (from scikit-learn) (2.2.0)
Note: you may need to restart the kernel to use updated packages.

In [39]: conda update -c conda-forge scikit-learn



service-identity conda-forge/noarch::service-identity-21.1.0-pyhd8ed1ab_0 
tomli conda-forge/noarch::tomli-2.0.1-pyhd8ed1ab_0
types-python-date~ conda-forge/noarch::types-python-dateutil-2.9.0.20240316-pyhd8ed1ab_0
typing_utils conda-forge/noarch::typing_utils-0.1.0-pyhd8ed1ab_0
ucrt conda-forge/win-64::ucrt-10.0.22621.0-h57928b3_0
uri-template conda-forge/noarch::uri-template-1.3.0-pyhd8ed1ab_0
vc14_runtime conda-forge/win-64::vc14_runtime-14.38.33130-h82b7239_18
webcolors conda-forge/noarch::webcolors-1.13-pyhd8ed1ab_0

The following packages will be REMOVED:

aiofiles-22.1.0-py311haa95532_0
aiosqlite-0.18.0-py311haa95532_0
async-timeout-4.0.2-py311haa95532_0
backcall-0.2.0-pyhd3eb1b0_0
bottleneck-1.3.5-py311h5bb9823_0
brotlipy-0.7.0-py311h2bbff1b_1002
datasets-2.12.0-py311haa95532_0
datashape-0.5.4-py311haa95532_1

heapdict-1.0.1-pyhd3eb1b0 0
In [3]: from sklearn.metrics import plot_confusion_matrix
import warnings
warnings.filterwarnings("ignore")
k= [CNB]
for i in k:
plot_confusion_matrix(i, X_test, y_test)
plt.title(i)
plt.show()

---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
Cell In[3], line 1
----> 1 from sklearn.metrics import plot_confusion_matrix
2 import warnings
3 warnings.filterwarnings("ignore")

ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (C:\anaconda\Lib\site-packages\sklearn


\metrics\__init__.py)

In [2]: def build_tokenizer(self):


"""Return a function that splits a string into a sequence of tokens.
Returns
-------
tokenizer: callable
A function to split a string into a sequence of tokens.
"""
if self.tokenizer is not None:
return self.tokenizer
token_pattern = re.compile(self.token_pattern)
return token_pattern.findall

#‫تستخدم لتحويل النصوص إلى مجموعة من الكلمات أو الرموز المفردة‬.
#‫ فإن الدالة ُترجعها مباشرة‬،‫إذا كانت مفردات التقسيم معينة بالفعل‬

In [ ]: ​

In [ ]: ​

In [ ]: ​

You might also like