
Crop Recommendation - Jupyter Notebook


LIBRARIES

In [1]: import pandas as pd


import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import warnings

In [56]: dt = pd.read_csv("/content/Crop_recommendation.csv")

In [57]: dt.head()

Out[57]: Nitrogen phosphorus potassium temperature humidity ph rainfall label Unnamed: 8 Unnamed: 9

0 90 42 43 20.879744 82.002744 6.502985 202.935536 rice NaN NaN

1 85 58 41 21.770462 80.319644 7.038096 226.655537 rice NaN NaN

2 60 55 44 23.004459 82.320763 7.840207 263.964248 rice NaN NaN

3 74 35 40 26.491096 80.158363 6.980401 242.864034 rice NaN NaN

4 78 42 42 20.130175 81.604873 7.628473 262.717340 rice NaN NaN

PRE-PROCESSING

In [58]: dt.shape

Out[58]: (2200, 10)

In [59]: dt.columns

Out[59]: Index(['Nitrogen', 'phosphorus', 'potassium', 'temperature', 'humidity', 'ph',
                'rainfall', 'label', 'Unnamed: 8', 'Unnamed: 9'],
               dtype='object')

In [60]: dt.isnull().any()

Out[60]: Nitrogen False


phosphorus False
potassium False
temperature False
humidity False
ph False
rainfall False
label False
Unnamed: 8 True
Unnamed: 9 True
dtype: bool

In [61]: dt.isnull().sum()

Out[61]: Nitrogen 0
phosphorus 0
potassium 0
temperature 0
humidity 0
ph 0
rainfall 0
label 0
Unnamed: 8 2200
Unnamed: 9 2200
dtype: int64
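
Since the two 'Unnamed' columns are completely null, they carry no information. As a sketch (the notebook drops them by name in a later cell), the same cleanup can be done generically:

# Sketch only: drop every column that is entirely null, keeping the original dt untouched
dt_clean = dt.dropna(axis=1, how='all')   # removes 'Unnamed: 8' and 'Unnamed: 9'
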
In [62]: dt['label'].value_counts()

Out[62]: rice 100


maize 100
jute 100
cotton 100
coconut 100
papaya 100
orange 100
apple 100
muskmelon 100
watermelon 100
grapes 100
mango 100
banana 100
pomegranate 100
lentil 100
blackgram 100
mungbean 100
mothbeans 100
pigeonpeas 100
kidneybeans 100
chickpea 100
coffee 100
Name: label, dtype: int64
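
Every crop appears exactly 100 times, so the classes are perfectly balanced. A quick way to see this visually (a sketch using the already-imported seaborn, not part of the original run):

plt.figure(figsize=(10, 6))
sns.countplot(y=dt['label'])   # one bar per crop; every bar should reach 100
plt.title('Samples per crop label')
plt.tight_layout()
plt.show()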

In [63]: crop_summary = pd.pivot_table(dt,index=['label'],aggfunc='mean')


crop_summary.head()

Out[63]: Nitrogen humidity ph phosphorus potassium rainfall temperature

label

apple 20.80 92.333383 5.929663 134.22 199.89 112.654779 22.630942

banana 100.23 80.358123 5.983893 82.01 50.05 104.626980 27.376798

blackgram 40.02 65.118426 7.133952 67.47 19.24 67.884151 29.973340

chickpea 40.09 16.860439 7.336957 67.79 79.92 80.058977 18.872847

coconut 21.98 94.844272 5.976562 16.93 30.59 175.686646 27.409892
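
crop_summary holds the per-crop mean of every feature, which makes it easy to compare nutrient requirements across crops. A minimal sketch using the imported make_subplots (plotly.graph_objects is an added import and the figure is illustrative, not part of the original output):

import plotly.graph_objects as go

fig = make_subplots(rows=1, cols=3, subplot_titles=('Nitrogen', 'phosphorus', 'potassium'))
fig.add_trace(go.Bar(x=crop_summary.index, y=crop_summary['Nitrogen']), row=1, col=1)
fig.add_trace(go.Bar(x=crop_summary.index, y=crop_summary['phosphorus']), row=1, col=2)
fig.add_trace(go.Bar(x=crop_summary.index, y=crop_summary['potassium']), row=1, col=3)
fig.update_layout(showlegend=False, title_text='Mean N-P-K requirement per crop')
fig.show()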

In [64]: crop = dt.drop(['Unnamed: 8','Unnamed: 9'],axis=1)

In [65]: crop.head()

Out[65]: Nitrogen phosphorus potassium temperature humidity ph rainfall label

0 90 42 43 20.879744 82.002744 6.502985 202.935536 rice

1 85 58 41 21.770462 80.319644 7.038096 226.655537 rice

2 60 55 44 23.004459 82.320763 7.840207 263.964248 rice

3 74 35 40 26.491096 80.158363 6.980401 242.864034 rice

4 78 42 42 20.130175 81.604873 7.628473 262.717340 rice

In [66]: from dataprep.datasets import load_dataset


from dataprep.eda import create_report
create_report(crop)

Computing series-max-agg-7b3faefce5a463bc854ddeacd69d6f1f: 0%| | 0/1458 [00:00<?, ?it/s]
/usr/local/lib/python3.10/dist-packages/dask/core.py:121: RuntimeWarning: invalid value encountered in divide
  return func(*(_execute_task(a, cache) for a in args))
/usr/local/lib/python3.10/dist-packages/dataprep/eda/distribution/render.py:274: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))

Out[66]: DataPrep Report (interactive HTML report with Overview, Variables, Interactions,
         Correlations and Missing Values tabs; Dataset Statistics lists 8 variables)
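
create_report builds an interactive HTML report inside the notebook. If you want to keep it outside the notebook, the report object can be written to a standalone file; the save method below follows the dataprep API as I recall it, so treat it as an assumption:

report = create_report(crop)
report.save('crop_eda_report.html')   # assumed dataprep Report.save API; writes the report as a standalone HTML file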

DATA SPLITTING

In [67]: X = crop.drop('label',axis=1)
Y = crop['label']

In [68]: X.head()

Out[68]: Nitrogen phosphorus potassium temperature humidity ph rainfall

0 90 42 43 20.879744 82.002744 6.502985 202.935536

1 85 58 41 21.770462 80.319644 7.038096 226.655537

2 60 55 44 23.004459 82.320763 7.840207 263.964248

3 74 35 40 26.491096 80.158363 6.980401 242.864034

4 78 42 42 20.130175 81.604873 7.628473 262.717340

FEATURE SELECTION

In [69]: from sklearn.feature_selection import SelectKBest


from sklearn.feature_selection import chi2

In [70]: ordered_rank_features = SelectKBest(score_func=chi2, k=7)
ordered_feature = ordered_rank_features.fit(X, Y)

In [71]: dtscores=pd.DataFrame(ordered_feature.scores_,columns=["Score"])
dtcolumns=pd.DataFrame(X.columns)

In [72]: features_rank=pd.concat([dtcolumns,dtscores],axis=1)

In [73]: features_rank.columns=['Features','Score']
features_rank

Out[73]: Features Score

0 Nitrogen 51393.681526

1 phosphorus 30248.326329

2 potassium 68889.682991

3 temperature 1057.631896

4 humidity 14147.237724

5 ph 70.382302

6 rainfall 54726.482814

In [74]: features_rank.nlargest(10,'Score')

Out[74]: Features Score

2 potassium 68889.682991

6 rainfall 54726.482814

0 Nitrogen 51393.681526

1 phosphorus 30248.326329

4 humidity 14147.237724

3 temperature 1057.631896

5 ph 70.382302
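
chi2 only works on non-negative features, which holds here since all seven inputs are measurements greater than or equal to zero, and its scores rank potassium, rainfall and Nitrogen highest. A small sketch plotting the same ranking for easier comparison (illustrative, not part of the original run):

features_rank.sort_values('Score').plot(x='Features', y='Score', kind='barh', legend=False)
plt.xlabel('chi-squared score')
plt.tight_layout()
plt.show()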

Feature Importance

In [75]: from sklearn.ensemble import ExtraTreesClassifier


#import matplotlib.pyplot as plt
model=ExtraTreesClassifier()
model.fit(X,Y)

Out[75]: ExtraTreesClassifier()

In [76]: print(model.feature_importances_)

[0.11747677 0.14029234 0.18605198 0.08597486 0.20970977 0.05619725
 0.20429702]

In [77]: ranked_features = pd.Series(model.feature_importances_, index=X.columns)
ranked_features.nlargest(10).plot(kind='barh')
plt.show()

INFORMATION GAIN

In [78]: from sklearn.feature_selection import mutual_info_classif

In [79]: mutual_info=mutual_info_classif(X,Y)

In [80]: mutual_data=pd.Series(mutual_info,index=X.columns)
mutual_data.sort_values(ascending=False)

Out[80]: humidity 1.729954


potassium 1.648949
rainfall 1.637358
phosphorus 1.292586
temperature 1.017901
Nitrogen 0.990584
ph 0.686067
dtype: float64
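
Mutual information broadly agrees with the chi-squared ranking: humidity, potassium and rainfall carry the most information about the crop label, while ph carries the least. A small sketch to visualise it (not part of the original output):

mutual_data.sort_values().plot(kind='barh')
plt.xlabel('mutual information with label')
plt.tight_layout()
plt.show()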

In [84]: from sklearn.model_selection import train_test_split


X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,shuffle =True,
random_state=0)

In [85]: X_train.shape

Out[85]: (1760, 7)

In [86]: X_test.shape

Out[86]: (440, 7)
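
Because every crop has exactly 100 samples, a stratified split keeps that balance in both partitions. A sketch of the same split with stratification (an alternative to the plain shuffled split used above, not what the notebook ran):

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, stratify=Y, random_state=0)   # stratify=Y preserves the 100-per-class balance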

In [88]: import lazypredict


from lazypredict.Supervised import LazyClassifier

In [89]: # Define and build the LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
# Fit twice: once scored on the training data itself, once on the held-out test data
models_train, predictions_train = clf.fit(X_train, X_train, Y_train, Y_train)
models_test, predictions_test = clf.fit(X_train, X_test, Y_train, Y_test)

# Prints the model performance


models_train

90%|████████▉ | 26/29 [00:09<00:00, 4.76it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1334
[LightGBM] [Info] Number of data points in the train set: 1760, number of used features: 7
[LightGBM] [Info] Start training from score -3.066350
[LightGBM] [Info] Start training from score -3.066350
[LightGBM] [Info] Start training from score -3.116360
[LightGBM] [Info] Start training from score -3.129264
[LightGBM] [Info] Start training from score -3.030418
[LightGBM] [Info] Start training from score -3.054228
[LightGBM] [Info] Start training from score -3.042252
[LightGBM] [Info] Start training from score -3.066350
[LightGBM] [Info] Start training from score -3.103621
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.054228
[LightGBM] [Info] Start training from score -3.066350
[LightGBM] [Info] Start training from score -3.103621
[LightGBM] [Info] Start training from score -3.155581
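
LazyClassifier returns a DataFrame of per-model metrics. To compare the baseline models on the held-out split, the test-set table can be sorted by accuracy (a sketch; the column name follows the usual lazypredict output and is assumed here):

models_test.sort_values(by='Accuracy', ascending=False).head(10)   # top 10 baseline models on the 440-row test set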

In [94]: from sklearn.ensemble import RandomForestClassifier #for the model


from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz #plot tree
from sklearn import metrics
from sklearn.metrics import roc_curve, auc #for model evaluation
from sklearn.metrics import classification_report #for model evaluation
from sklearn.metrics import confusion_matrix #for model evaluation
from sklearn.metrics import accuracy_score
from sklearn import svm

In [96]: forest = RandomForestClassifier(n_estimators=100,max_depth=5,bootstrap=True,oob_score=False,criterion='gini')


forest.fit(X_train, Y_train)
warnings.simplefilter('ignore')
print(f"Accuracy of Test Dataset: {forest.score(X_test,Y_test):0.3f}")
print(f"Accuracy of Train Dataset: {forest.score(X_train,Y_train):0.3f}")

Accuracy of Test Dataset: 0.986


Accuracy of Train Dataset: 0.993
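
A single 80/20 split can be optimistic; a quick cross-validation gives a more stable estimate of the random forest's accuracy (a sketch, not part of the original run):

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(forest, X, Y, cv=5)   # 5-fold cross-validation on the full dataset
print(f"CV accuracy: {cv_scores.mean():0.3f} +/- {cv_scores.std():0.3f}")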

In [103]: import pickle


# open a file where you want to store the trained model
with open('model.pkl', 'wb') as file:
    # dump (serialize) the fitted forest into that file
    pickle.dump(forest, file)
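
The pickled model can later be loaded back and used for predictions without retraining. A minimal sketch (the sample values are illustrative, taken from the first row of the dataset):

with open('model.pkl', 'rb') as f:
    loaded_forest = pickle.load(f)

# Hypothetical sample: Nitrogen, phosphorus, potassium, temperature, humidity, ph, rainfall
sample = pd.DataFrame([[90, 42, 43, 20.9, 82.0, 6.5, 202.9]], columns=X.columns)
print(loaded_forest.predict(sample))   # expected to print something like ['rice']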

In [105]: y_predict = model.predict(X_test)


y_pred_quant = model.predict_proba(X_test)[:, 1]
y_pred_bin = model.predict(X_test)

In [106]: cm = confusion_matrix(Y_test, y_pred_bin)   # store the result in 'cm' so the confusion_matrix function is not shadowed
cm

Out[106]: array([[18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
23, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 21, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 22, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 23, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 25, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 17]])

In [107]: total = sum(sum(cm))

sensitivity = cm[0, 0] / (cm[0, 0] + cm[1, 0])
print('Sensitivity : ', sensitivity)

specificity = cm[1, 1] / (cm[1, 1] + cm[0, 1])
print('Specificity : ', specificity)

Sensitivity : 1.0
Specificity : 1.0
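
The sensitivity and specificity above only use the top-left 2x2 corner of a 22-class confusion matrix, so they describe just the first two crops. For a per-class view, the already-imported classification_report covers every label (a sketch, not part of the original output):

print(classification_report(Y_test, y_pred_bin))   # precision, recall and F1 score for every crop class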
