
Crop Recommendation - Jupyter Notebook


LIBRARIES

In [1]: import pandas as pd


import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import warnings

In [56]: dt = pd.read_csv("/content/Crop_recommendation.csv")

In [57]: dt.head()

Out[57]: Nitrogen phosphorus potassium temperature humidity ph rainfall label Unnamed: 8 Unnamed: 9

0 90 42 43 20.879744 82.002744 6.502985 202.935536 rice NaN NaN

1 85 58 41 21.770462 80.319644 7.038096 226.655537 rice NaN NaN

2 60 55 44 23.004459 82.320763 7.840207 263.964248 rice NaN NaN

3 74 35 40 26.491096 80.158363 6.980401 242.864034 rice NaN NaN

4 78 42 42 20.130175 81.604873 7.628473 262.717340 rice NaN NaN

PRE-PROCESSING

In [58]: dt.shape

Out[58]: (2200, 10)

In [59]: dt.columns

Out[59]: Index(['Nitrogen', 'phosphorus', 'potassium', 'temperature', 'humidity', 'ph',
                'rainfall', 'label', 'Unnamed: 8', 'Unnamed: 9'],
               dtype='object')

In [60]: dt.isnull().any()

Out[60]: Nitrogen False


phosphorus False
potassium False
temperature False
humidity False
ph False
rainfall False
label False
Unnamed: 8 True
Unnamed: 9 True
dtype: bool

In [61]: dt.isnull().sum()

Out[61]: Nitrogen 0
phosphorus 0
potassium 0
temperature 0
humidity 0
ph 0
rainfall 0
label 0
Unnamed: 8 2200
Unnamed: 9 2200
dtype: int64
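
Since the two 'Unnamed' columns are completely null, they carry no information. As a sketch (the notebook drops them by name in a later cell), the same cleanup can be done generically:

# Sketch only: drop every column that is entirely null, keeping the original dt untouched
dt_clean = dt.dropna(axis=1, how='all')   # removes 'Unnamed: 8' and 'Unnamed: 9'
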
In [62]: dt['label'].value_counts()

Out[62]: rice 100


maize 100
jute 100
cotton 100
coconut 100
papaya 100
orange 100
apple 100
muskmelon 100
watermelon 100
grapes 100
mango 100
banana 100
pomegranate 100
lentil 100
blackgram 100
mungbean 100
mothbeans 100
pigeonpeas 100
kidneybeans 100
chickpea 100
coffee 100
Name: label, dtype: int64
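
Every crop appears exactly 100 times, so the classes are perfectly balanced. A quick way to see this visually (a sketch using the already-imported seaborn, not part of the original run):

plt.figure(figsize=(10, 6))
sns.countplot(y=dt['label'])   # one bar per crop; every bar should reach 100
plt.title('Samples per crop label')
plt.tight_layout()
plt.show()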

In [63]: crop_summary = pd.pivot_table(dt,index=['label'],aggfunc='mean')


crop_summary.head()

Out[63]: Nitrogen humidity ph phosphorus potassium rainfall temperature

label

apple 20.80 92.333383 5.929663 134.22 199.89 112.654779 22.630942

banana 100.23 80.358123 5.983893 82.01 50.05 104.626980 27.376798

blackgram 40.02 65.118426 7.133952 67.47 19.24 67.884151 29.973340

chickpea 40.09 16.860439 7.336957 67.79 79.92 80.058977 18.872847

coconut 21.98 94.844272 5.976562 16.93 30.59 175.686646 27.409892
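
crop_summary holds the per-crop mean of every feature, which makes it easy to compare nutrient requirements across crops. A minimal sketch using the imported make_subplots (plotly.graph_objects is an added import and the figure is illustrative, not part of the original output):

import plotly.graph_objects as go

fig = make_subplots(rows=1, cols=3, subplot_titles=('Nitrogen', 'phosphorus', 'potassium'))
fig.add_trace(go.Bar(x=crop_summary.index, y=crop_summary['Nitrogen']), row=1, col=1)
fig.add_trace(go.Bar(x=crop_summary.index, y=crop_summary['phosphorus']), row=1, col=2)
fig.add_trace(go.Bar(x=crop_summary.index, y=crop_summary['potassium']), row=1, col=3)
fig.update_layout(showlegend=False, title_text='Mean N-P-K requirement per crop')
fig.show()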

In [64]: crop = dt.drop(['Unnamed: 8','Unnamed: 9'],axis=1)

In [65]: crop.head()

Out[65]: Nitrogen phosphorus potassium temperature humidity ph rainfall label

0 90 42 43 20.879744 82.002744 6.502985 202.935536 rice

1 85 58 41 21.770462 80.319644 7.038096 226.655537 rice

2 60 55 44 23.004459 82.320763 7.840207 263.964248 rice

3 74 35 40 26.491096 80.158363 6.980401 242.864034 rice

4 78 42 42 20.130175 81.604873 7.628473 262.717340 rice

In [66]: from dataprep.datasets import load_dataset


from dataprep.eda import create_report
create_report(crop)

Computing series-max-agg-7b3faefce5a463bc854ddeacd69d6f1f: 0%| | 0/1458 [00:00<?, ?it/s]
/usr/local/lib/python3.10/dist-packages/dask/core.py:121: RuntimeWarning: invalid value encountered in divide
  return func(*(_execute_task(a, cache) for a in args))
/usr/local/lib/python3.10/dist-packages/dataprep/eda/distribution/render.py:274: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  df = df.append(pd.DataFrame({col: [nrows - npresent]}, index=["Others"]))

Out[66]: DataPrep Report (interactive HTML report with Overview, Variables, Interactions,
         Correlations and Missing Values tabs; Dataset Statistics lists 8 variables)
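
create_report builds an interactive HTML report inside the notebook. If you want to keep it outside the notebook, the report object can be written to a standalone file; the save method below follows the dataprep API as I recall it, so treat it as an assumption:

report = create_report(crop)
report.save('crop_eda_report.html')   # assumed dataprep Report.save API; writes the report as a standalone HTML file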

DATA SPLITTING

In [67]: X = crop.drop('label',axis=1)
Y = crop['label']

In [68]: X.head()

Out[68]: Nitrogen phosphorus potassium temperature humidity ph rainfall

0 90 42 43 20.879744 82.002744 6.502985 202.935536

1 85 58 41 21.770462 80.319644 7.038096 226.655537

2 60 55 44 23.004459 82.320763 7.840207 263.964248

3 74 35 40 26.491096 80.158363 6.980401 242.864034

4 78 42 42 20.130175 81.604873 7.628473 262.717340

FEATURE SELECTION

In [69]: from sklearn.feature_selection import SelectKBest


from sklearn.feature_selection import chi2

In [70]: ordered_rank_features = SelectKBest(score_func=chi2, k=7)
ordered_feature = ordered_rank_features.fit(X, Y)

In [71]: dtscores=pd.DataFrame(ordered_feature.scores_,columns=["Score"])
dtcolumns=pd.DataFrame(X.columns)

In [72]: features_rank=pd.concat([dtcolumns,dtscores],axis=1)

In [73]: features_rank.columns=['Features','Score']
features_rank

Out[73]: Features Score

0 Nitrogen 51393.681526

1 phosphorus 30248.326329

2 potassium 68889.682991

3 temperature 1057.631896

4 humidity 14147.237724

5 ph 70.382302

6 rainfall 54726.482814

In [74]: features_rank.nlargest(10,'Score')

Out[74]: Features Score

2 potassium 68889.682991

6 rainfall 54726.482814

0 Nitrogen 51393.681526

1 phosphorus 30248.326329

4 humidity 14147.237724

3 temperature 1057.631896

5 ph 70.382302
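
chi2 only works on non-negative features, which holds here since all seven inputs are measurements greater than or equal to zero, and its scores rank potassium, rainfall and Nitrogen highest. A small sketch plotting the same ranking for easier comparison (illustrative, not part of the original run):

features_rank.sort_values('Score').plot(x='Features', y='Score', kind='barh', legend=False)
plt.xlabel('chi-squared score')
plt.tight_layout()
plt.show()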

Feature Importance

In [75]: from sklearn.ensemble import ExtraTreesClassifier


#import matplotlib.pyplot as plt
model=ExtraTreesClassifier()
model.fit(X,Y)

Out[75]: ExtraTreesClassifier()

In [76]: print(model.feature_importances_)

[0.11747677 0.14029234 0.18605198 0.08597486 0.20970977 0.05619725
 0.20429702]

In [77]: ranked_features = pd.Series(model.feature_importances_, index=X.columns)
ranked_features.nlargest(10).plot(kind='barh')
plt.show()

INFORMATION GAIN

In [78]: from sklearn.feature_selection import mutual_info_classif

In [79]: mutual_info=mutual_info_classif(X,Y)

In [80]: mutual_data=pd.Series(mutual_info,index=X.columns)
mutual_data.sort_values(ascending=False)

Out[80]: humidity 1.729954


potassium 1.648949
rainfall 1.637358
phosphorus 1.292586
temperature 1.017901
Nitrogen 0.990584
ph 0.686067
dtype: float64
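
Mutual information broadly agrees with the chi-squared ranking: humidity, potassium and rainfall carry the most information about the crop label, while ph carries the least. A small sketch to visualise it (not part of the original output):

mutual_data.sort_values().plot(kind='barh')
plt.xlabel('mutual information with label')
plt.tight_layout()
plt.show()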

In [84]: from sklearn.model_selection import train_test_split


X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,shuffle =True,
random_state=0)

In [85]: X_train.shape

Out[85]: (1760, 7)

In [86]: X_test.shape

Out[86]: (440, 7)
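
Because every crop has exactly 100 samples, a stratified split keeps that balance in both partitions. A sketch of the same split with stratification (an alternative to the plain shuffled split used above, not what the notebook ran):

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, stratify=Y, random_state=0)   # stratify=Y preserves the 100-per-class balance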

In [88]: import lazypredict


from lazypredict.Supervised import LazyClassifier

In [89]: # Define and build the LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
# Fit twice: once scored on the training data itself, once on the held-out test data
models_train, predictions_train = clf.fit(X_train, X_train, Y_train, Y_train)
models_test, predictions_test = clf.fit(X_train, X_test, Y_train, Y_test)

# Prints the model performance


models_train

90%|████████▉ | 26/29 [00:09<00:00, 4.76it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1334
[LightGBM] [Info] Number of data points in the train set: 1760, number of used features: 7
[LightGBM] [Info] Start training from score -3.066350
[LightGBM] [Info] Start training from score -3.066350
[LightGBM] [Info] Start training from score -3.116360
[LightGBM] [Info] Start training from score -3.129264
[LightGBM] [Info] Start training from score -3.030418
[LightGBM] [Info] Start training from score -3.054228
[LightGBM] [Info] Start training from score -3.042252
[LightGBM] [Info] Start training from score -3.066350
[LightGBM] [Info] Start training from score -3.103621
[LightGBM] [Info] Start training from score -3.091042
[LightGBM] [Info] Start training from score -3.054228
[LightGBM] [Info] Start training from score -3.066350
[LightGBM] [Info] Start training from score -3.103621
[LightGBM] [Info] Start training from score -3.155581
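
LazyClassifier returns a DataFrame of per-model metrics. To compare the baseline models on the held-out split, the test-set table can be sorted by accuracy (a sketch; the column name follows the usual lazypredict output and is assumed here):

models_test.sort_values(by='Accuracy', ascending=False).head(10)   # top 10 baseline models on the 440-row test set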

In [94]: from sklearn.ensemble import RandomForestClassifier #for the model


from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz #plot tree
from sklearn import metrics
from sklearn.metrics import roc_curve, auc #for model evaluation
from sklearn.metrics import classification_report #for model evaluation
from sklearn.metrics import confusion_matrix #for model evaluation
from sklearn.metrics import accuracy_score
from sklearn import svm

In [96]: forest = RandomForestClassifier(n_estimators=100,max_depth=5,bootstrap=True,oob_score=False,criterion='gini')


forest.fit(X_train, Y_train)
warnings.simplefilter('ignore')
print(f"Accuracy of Test Dataset: {forest.score(X_test,Y_test):0.3f}")
print(f"Accuracy of Train Dataset: {forest.score(X_train,Y_train):0.3f}")

Accuracy of Test Dataset: 0.986


Accuracy of Train Dataset: 0.993
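
A single 80/20 split can be optimistic; a quick cross-validation gives a more stable estimate of the random forest's accuracy (a sketch, not part of the original run):

from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(forest, X, Y, cv=5)   # 5-fold cross-validation on the full dataset
print(f"CV accuracy: {cv_scores.mean():0.3f} +/- {cv_scores.std():0.3f}")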

In [103]: import pickle


# open a file where you want to store the trained model
with open('model.pkl', 'wb') as file:
    # dump (serialize) the fitted forest into that file
    pickle.dump(forest, file)
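
The pickled model can later be loaded back and used for predictions without retraining. A minimal sketch (the sample values are illustrative, taken from the first row of the dataset):

with open('model.pkl', 'rb') as f:
    loaded_forest = pickle.load(f)

# Hypothetical sample: Nitrogen, phosphorus, potassium, temperature, humidity, ph, rainfall
sample = pd.DataFrame([[90, 42, 43, 20.9, 82.0, 6.5, 202.9]], columns=X.columns)
print(loaded_forest.predict(sample))   # expected to print something like ['rice']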

In [105]: y_predict = model.predict(X_test)


y_pred_quant = model.predict_proba(X_test)[:, 1]
y_pred_bin = model.predict(X_test)

In [106]: cm = confusion_matrix(Y_test, y_pred_bin)   # store the result in 'cm' so the confusion_matrix function is not shadowed
cm

Out[106]: array([[18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 17, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 18, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 0,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23,
0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
23, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 21, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 22, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 23, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 25, 0],
[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 17]])

In [107]: total = sum(sum(cm))

sensitivity = cm[0, 0] / (cm[0, 0] + cm[1, 0])
print('Sensitivity : ', sensitivity)

specificity = cm[1, 1] / (cm[1, 1] + cm[0, 1])
print('Specificity : ', specificity)

Sensitivity : 1.0
Specificity : 1.0
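
The sensitivity and specificity above only use the top-left 2x2 corner of a 22-class confusion matrix, so they describe just the first two crops. For a per-class view, the already-imported classification_report covers every label (a sketch, not part of the original output):

print(classification_report(Y_test, y_pred_bin))   # precision, recall and F1 score for every crop class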
