Nothing Special   »   [go: up one dir, main page]

Project 8 Predictive Analytics - Ipynb - Colaboratory

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 8

import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the MPG dataset directly from the YBI Foundation GitHub repository.
df = pd.read_csv(
    filepath_or_buffer='https://github.com/YBI-Foundation/Dataset/raw/main/MPG.csv',
)

# Preview the first five rows.
df.head(n=5)

mpg cylinders displacement horsepower weight acceleration model_year

0 18.0 8 307.0 130.0 3504 12.0 70

1 15.0 8 350.0 165.0 3693 11.5 70

2 18.0 8 318.0 150.0 3436 11.0 70

3 16.0 8 304.0 150.0 3433 12.0 70

4 17.0 8 302.0 140.0 3449 10.5 70

Saved successfully!

# Count the distinct (non-null) values in each column.
df.nunique()

mpg 129
cylinders 5
displacement 82
horsepower 93
weight 351
acceleration 95
model_year 13
origin 3
name 305
dtype: int64

# Summarise the column dtypes and non-null counts to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mpg 398 non-null float64
1 cylinders 398 non-null int64
2 displacement 398 non-null float64
3 horsepower 392 non-null float64
4 weight 398 non-null int64
5 acceleration 398 non-null float64
6 model_year 398 non-null int64
7 origin 398 non-null object
8 name 398 non-null object
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB

# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

mpg cylinders displacement horsepower weight acceleration

count 398.000000 398.000000 398.000000 392.000000 398.000000 398.000000

mean 23.514573 5.454774 193.425879 104.469388 2970.424623 15.568090

std 7.815984 1.701004 104.269838 38.491160 846.841774 2.757689

min 9.000000 3.000000 68.000000 46.000000 1613.000000 8.000000

25% 17.500000 4.000000 104.250000 75.000000 2223.750000 13.825000

50% 23.000000 4.000000 148.500000 93.500000 2803.500000 15.500000

75% 29.000000 8.000000 262.000000 126.000000 3608.000000 17.175000

max 46.600000 8.000000 455.000000 230.000000 5140.000000 24.800000

Saved successfully!

# Pairwise correlations of the numeric columns only: 'origin' and 'name' are
# object columns, and recent pandas releases raise on them instead of silently
# skipping them, so the numeric subset is selected explicitly.
df.select_dtypes(include='number').corr()

# Drop every row that has a missing value in any column.
df = df.dropna()

# Re-check the row count and non-null counts after removing incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 mpg 392 non-null float64
1 cylinders 392 non-null int64
2 displacement 392 non-null float64
3 horsepower 392 non-null float64
4 weight 392 non-null int64
5 acceleration 392 non-null float64
6 model_year 392 non-null int64
7 origin 392 non-null object
8 name 392 non-null object
dtypes: float64(4), int64(3), object(2)
memory usage: 30.6+ KB

# Pairwise plots with the selected columns along the x-axis.
# NOTE(review): no y_vars is given, so seaborn should use every numeric column
# for the grid rows — pass y_vars=['mpg'] if only the target row is wanted.
sns.pairplot(
    data=df,
    x_vars=['displacement', 'horsepower', 'weight', 'acceleration', 'mpg'],
)

<seaborn.axisgrid.PairGrid at 0x7f2fcdd60190>

Saved successfully!

# Scatter plot of mpg against displacement with a fitted regression line;
# the trailing semicolon suppresses the notebook's echo of the return value.
sns.regplot(data=df, x='displacement', y='mpg');


# List the column labels to choose the target and the features from.
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',


'acceleration', 'model_year', 'origin', 'name'],
dtype='object')

# Target variable: the 'mpg' column.
y = df.loc[:, 'mpg']

# Number of target observations.
y.shape

(392,)

# Feature matrix: the four predictor columns used by the model.
X = df.loc[:, ['displacement', 'horsepower', 'weight', 'acceleration']]

# (rows, columns) of the feature matrix.
X.shape

(392, 4)

# Display the unscaled feature frame.
X
Saved successfully!
displacement horsepower weight acceleration

0 307.0 130.0 3504 12.0

1 350.0 165.0 3693 11.5

from sklearn.preprocessing import StandardScaler

# Standardise each feature column to zero mean and unit variance.
ss = StandardScaler()

# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split further down, so test-set statistics influence the scaling;
# consider fitting on the training split only.
# X is rebound here from a DataFrame to the NumPy array fit_transform returns.
X = ss.fit_transform(X)

array([[ 1.07728956, 0.66413273, 0.62054034, -1.285258 ],


[ 1.48873169, 1.57459447, 0.84333403, -1.46672362],
[ 1.1825422 , 1.18439658, 0.54038176, -1.64818924],
...,
[-0.56847897, -0.53247413, -0.80463202, -1.4304305 ],
[-0.7120053 , -0.66254009, -0.41562716, 1.11008813],
[-0.72157372, -0.58450051, -0.30364091, 1.40043312]])

# Sanity-check the scaling: every column should show mean ~0 and std ~1.
pd.DataFrame(X).describe()

0 1 2 3

count 3.920000e+02 3.920000e+02 3.920000e+02 3.920000e+02

mean -2.537653e-16 -4.392745e-16 5.607759e-17 6.117555e-16


Saved successfully!
std 1.001278e+00 1.001278e+00 1.001278e+00 1.001278e+00

min -1.209563e+00 -1.520975e+00 -1.608575e+00 -2.736983e+00

25% -8.555316e-01 -7.665929e-01 -8.868535e-01 -6.410551e-01

50% -4.153842e-01 -2.853488e-01 -2.052109e-01 -1.499869e-02

75% 7.782764e-01 5.600800e-01 7.510927e-01 5.384714e-01

max 2.493416e+00 3.265452e+00 2.549061e+00 3.360262e+00

from sklearn.model_selection import train_test_split

# Keep 70% of the rows for training; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=2529,
)

# Shapes of the four resulting splits.
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((274, 4), (118, 4), (274,), (118,))

from sklearn.linear_model import LinearRegression

# Linear regression estimator with default settings.
lr = LinearRegression()

# Fit the model on the training split.
lr.fit(X=X_train, y=y_train)

LinearRegression()

# Fitted intercept of the linear model.
lr.intercept_

23.485738559737584

# Fitted coefficients, one per standardised feature column.
lr.coef_

array([-1.05767743, -1.68734727, -4.10787617, -0.11495177])

# Predict the target for the held-out test rows.
y_pred = lr.predict(X=X_test)

# Display the predictions.
y_pred

array([18.51865637, 15.09305675, 14.30128789, 23.6753321 , 29.7546115 ,


23.68796629, 26.61066644, 24.56692437, 15.06260986, 11.94312046,
24.08050053, 27.96518468, 31.66130278, 31.01309132, 18.32428976,
19.32795009, 28.08847536, 32.1506879 , 31.15859692, 27.15792144,
18.82433097, 22.54580176, 26.15598115, 32.36393869, 20.74377679,
Saved successfully!
8.78027518, 22.19699435, 18.20614294, 25.00052718, 15.26421552,
23.13441082, 17.10542257, 9.87180062, 30.00790415, 20.41204655,
29.11860245, 24.4305187 , 21.72601835, 10.51174626, 13.12426391,
21.41938406, 19.96113872, 6.19146626, 17.79025345, 22.5493033 ,
29.34765021, 13.4861847 , 25.88852083, 29.40406946, 22.41841964,
22.07684766, 16.46575802, 24.06290693, 30.12890046, 10.11318121,
9.85011438, 28.07543852, 23.41426617, 20.08501128, 30.68234133,
20.92026393, 26.78370281, 22.9078744 , 14.15936872, 24.6439883 ,
26.95515832, 15.25709393, 24.11272087, 30.80980589, 14.9770217 ,
27.67836372, 24.2372919 , 10.92177228, 30.22858779, 30.88687365,
27.33992044, 31.18447082, 10.8873597 , 27.63510608, 16.49231363,
25.63229888, 29.49776285, 14.90393439, 32.78670687, 30.37325244,
30.9262743 , 14.71702373, 27.09633246, 26.69933806, 29.06424799,
32.45810182, 29.44846898, 31.61239999, 31.57891837, 21.46542321,
31.76739191, 26.28605476, 28.96419915, 31.09628395, 24.80549594,
18.76490961, 23.28043777, 23.04466919, 22.14143162, 15.95854367,
28.62870918, 25.58809869, 11.4040908 , 25.73334842, 30.83500051,
21.94176255, 15.34532941, 30.37399213, 28.7620624 , 29.3639931 ,
29.10476703, 20.44662365, 28.11466839])

from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score


# Mean absolute error of the linear model on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

3.3286968643244106

# Mean absolute percentage error (as a fraction) of the linear model on the test split.
mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)

0.14713035779536746

# Coefficient of determination (R^2) of the linear model on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.7031250746717692

from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion restricted to interaction terms (no squared terms) and
# without a constant bias column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Fit the expansion on the training split and transform it.
X_train2 = poly.fit_transform(X_train)

# Apply the already-fitted expansion to the test split; a transformer must
# only ever be fitted on training data, never re-fitted on the test set.
X_test2 = poly.transform(X_test)

# Refit the same estimator on the expanded training features; this replaces
# the coefficients of the earlier plain linear fit.
lr.fit(X_train2, y_train)

LinearRegression()

# Intercept of the model refitted on the interaction-expanded features.
lr.intercept_
Saved successfully!
21.27336450063766

# Coefficients of the refitted model, one per expanded feature column.
lr.coef_

array([-2.76070596, -5.00559628, -1.36884133, -0.81225214, 1.24596571,


-0.12475017, -0.90542822, 1.35064048, -0.17337823, 1.41680398])

# Predict with the interaction-term model on the expanded test features.
y_pred_poly = lr.predict(X=X_test2)

from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

# Mean absolute error of the interaction-term model on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred_poly)

2.7887147720295977

# Mean absolute percentage error (as a fraction) of the interaction-term model.
mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred_poly)

0.1207401834293869
# Coefficient of determination (R^2) of the interaction-term model.
r2_score(y_true=y_test, y_pred=y_pred_poly)

0.7461731314563803

Saved successfully!

You might also like