
Source Code Python Jemmy


1.

pip install google-play-scraper

2.

from google_play_scraper import Sort, reviews


result, continuation_token = reviews(
    'com.dts.freefireth',
    lang='id',               # defaults to 'en'
    country='id',            # defaults to 'us'
    sort=Sort.MOST_RELEVANT, # defaults to Sort.MOST_RELEVANT
    count=4000,              # defaults to 100
)

result, _ = reviews(
    'com.dts.freefireth',
    continuation_token=continuation_token  # defaults to None (load from the beginning)
)

print(result)
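If more reviews are needed than a single call returns, the continuation token can be fed back in a loop. A minimal sketch using only the reviews() call shown above; the 200-per-batch size and the 4000-review stop point are arbitrary choices:

all_reviews, token = reviews('com.dts.freefireth', lang='id', country='id',
                             sort=Sort.MOST_RELEVANT, count=200)

# keep requesting batches with the continuation token until enough reviews are collected
while len(all_reviews) < 4000:
    batch, token = reviews('com.dts.freefireth', continuation_token=token)
    if not batch:          # no more reviews available
        break
    all_reviews.extend(batch)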

3.

import pandas as pd

df = pd.DataFrame(result)
df.to_csv("D:/TestData11.CSV")
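To keep the exported file clean, the DataFrame index can be dropped and the encoding fixed explicitly. A minimal sketch, reusing the path from the step above:

df.to_csv("D:/TestData11.CSV", index=False, encoding='utf-8')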

4.
pip install nltk

5.
import nltk
nltk.download()
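nltk.download() with no arguments opens the interactive downloader. In a script it is usually enough to fetch only the resources this pipeline uses:

import nltk

# download only what the later steps need
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stopword lists, including Indonesian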

6.
pip install Sastrawi
7.
pip install numpy

8.
import pandas as pd
import numpy as np

TWEET_DATA = pd.read_csv("D:/data_ff.csv")

TWEET_DATA.head()

9.
TWEET_DATA.to_csv("D:/data_ff.csv")

10.

# ------ Case Folding --------

# use the Pandas Series.str.lower() function
TWEET_DATA['content'] = TWEET_DATA['content'].str.lower()

print('Case Folding Result : \n')
print(TWEET_DATA['content'].head(5))
print('\n\n\n')

11.

import string
import re #regex library

# import word_tokenize & FreqDist from NLTK


from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# ------ Tokenizing ---------

def remove_tweet_special(text):
    # remove tab, newline, and backslash escape sequences
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # remove non-ASCII characters (emoticons, Chinese characters, etc.)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mentions, links, and hashtags
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # remove incomplete URLs
    return text.replace("http://", " ").replace("https://", " ")

TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_tweet_special)

# remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans("", "", string.punctuation))

TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_punctuation)

# remove leading & trailing whitespace
def remove_whitespace_LT(text):
    return text.strip()

TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_whitespace_LT)

# collapse multiple whitespace characters into a single space
def remove_whitespace_multiple(text):
    return re.sub(r'\s+', ' ', text)

TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_whitespace_multiple)

# remove single characters
def remove_singl_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

TWEET_DATA['content'] = TWEET_DATA['content'].apply(remove_singl_char)

# NLTK word tokenize
def word_tokenize_wrapper(text):
    return word_tokenize(text)

TWEET_DATA['content_tokens'] = TWEET_DATA['content'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n')
print(TWEET_DATA['content_tokens'].head())
print('\n\n\n')
# NLTK frequency distribution
def freqDist_wrapper(text):
    return FreqDist(text)

TWEET_DATA['content_tokens_fdist'] = TWEET_DATA['content_tokens'].apply(freqDist_wrapper)

print('Frequency Tokens : \n')
print(TWEET_DATA['content_tokens_fdist'].head().apply(lambda x : x.most_common()))
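The per-row distributions above can also be combined into one corpus-level count. A minimal sketch, assuming the content_tokens column created in the tokenizing step:

# flatten all token lists and count the whole corpus at once
all_tokens = [token for doc in TWEET_DATA['content_tokens'] for token in doc]
corpus_fdist = FreqDist(all_tokens)
print(corpus_fdist.most_common(20))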

12.

from nltk.corpus import stopwords

# ----------------------- get stopwords from the NLTK stopword list ----------------------

# get Indonesian stopwords
list_stopwords = stopwords.words('indonesian')

# ---------------------------- manually add stopwords ------------------------------------

# append additional stopwords
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])

# ----------------------- add stopwords from a txt file ----------------------------------

# read the txt stopword list using pandas
txt_stopword = pd.read_csv("D:/stopwords.txt", names=["stopwords"], header=None)

# convert the stopword string to a list & append the additional stopwords
list_stopwords.extend(txt_stopword["stopwords"][0].split(' '))

# -----------------------------------------------------------------------------------------

# convert the list to a set for faster lookup
list_stopwords = set(list_stopwords)

# remove stopwords from the token list
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

TWEET_DATA['content_tokens_WSW'] = TWEET_DATA['content_tokens'].apply(stopwords_removal)

print(TWEET_DATA['content_tokens_WSW'].head())
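As a quick sanity check on the stopword removal, the average token count per review before and after can be compared. A minimal sketch using the columns created above:

print('avg tokens before :', TWEET_DATA['content_tokens'].apply(len).mean())
print('avg tokens after  :', TWEET_DATA['content_tokens_WSW'].apply(len).mean())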

13.

normalizad_word = pd.read_excel("D:/normalisasi.xlsx")

normalizad_word_dict = {}

for index, row in normalizad_word.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1]

def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

TWEET_DATA['content_normalized'] = TWEET_DATA['content_tokens_WSW'].apply(normalized_term)

TWEET_DATA['content_normalized'].head(10)
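From the loop above, normalisasi.xlsx is expected to hold two columns: the informal/slang spelling in the first and its normalized form in the second. If the Excel file is not available, the same lookup can be built from a plain dictionary; a minimal sketch with hypothetical example entries:

# hypothetical slang -> standard form entries; replace with the real mapping
normalizad_word_dict = {
    'bgt': 'banget',
    'gk': 'tidak',
}

TWEET_DATA['content_normalized'] = TWEET_DATA['content_tokens_WSW'].apply(normalized_term)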

14.

conda install -c conda-forge swifter

15.

# import the Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

# create the stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stem a single term
def stemmed_wrapper(term):
    return stemmer.stem(term)

# collect every unique term so each one is stemmed only once
term_dict = {}

for document in TWEET_DATA['content_normalized']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '

print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term, ":", term_dict[term])

print(term_dict)
print("------------------------")

# apply the stemmed terms to the dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

TWEET_DATA['content_tokens_stemmed'] = TWEET_DATA['content_normalized'].swifter.apply(get_stemmed_term)
print(TWEET_DATA['content_tokens_stemmed'])
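Stemming every unique term with Sastrawi is the slowest part of the pipeline, so it can be worth persisting term_dict once it is built. A minimal sketch, assuming a writable term_dict_cache.json path (hypothetical file name):

import json

# save the stemmed lookup so a later run can json.load() it instead of re-stemming
with open("term_dict_cache.json", "w", encoding="utf-8") as f:
    json.dump(term_dict, f, ensure_ascii=False)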

16.

TWEET_DATA.to_csv("data_ff.csv")
17.
TWEET_DATA.to_excel("data_ff.xlsx")
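When the exported CSV is loaded again later, the token-list columns come back as plain strings. A minimal sketch for restoring them to Python lists, with the column name taken from the stemming step above:

import ast

df = pd.read_csv("data_ff.csv")
df['content_tokens_stemmed'] = df['content_tokens_stemmed'].apply(ast.literal_eval)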
