In [2]:
# Import the numpy and pandas package
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
# Read the given CSV file, and view some sample records
advertising = pd.read_csv("datasets/ads.csv")
advertising.head()
Out[3]:
  | TV | Radio | Newspaper | Sales |
---|---|---|---|---|
0 | 230.1 | 37.8 | 69.2 | 22.1 |
1 | 44.5 | 39.3 | 45.1 | 10.4 |
2 | 17.2 | 45.9 | 69.3 | 12.0 |
3 | 151.5 | 41.3 | 58.5 | 16.5 |
4 | 180.8 | 10.8 | 58.4 | 17.9 |
In [4]:
advertising.shape
Out[4]:
(200, 4)
In [5]:
advertising.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
TV           200 non-null float64
Radio        200 non-null float64
Newspaper    200 non-null float64
Sales        200 non-null float64
dtypes: float64(4)
memory usage: 6.4 KB
In [6]:
advertising.isnull().sum()
Out[6]:
TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64
In [7]:
advertising.describe()
Out[7]:
  | TV | Radio | Newspaper | Sales |
---|---|---|---|---|
count | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
mean | 147.042500 | 23.264000 | 30.554000 | 15.130500 |
std | 85.854236 | 14.846809 | 21.778621 | 5.283892 |
min | 0.700000 | 0.000000 | 0.300000 | 1.600000 |
25% | 74.375000 | 9.975000 | 12.750000 | 11.000000 |
50% | 149.750000 | 22.900000 | 25.750000 | 16.000000 |
75% | 218.825000 | 36.525000 | 45.100000 | 19.050000 |
max | 296.400000 | 49.600000 | 114.000000 | 27.000000 |
In [8]:
sns.regplot(x='TV', y='Sales', data=advertising)
plt.show()
In [9]:
sns.regplot(x='Radio', y='Sales', data=advertising)
plt.show()
In [10]:
sns.regplot(x='Newspaper', y='Sales', data=advertising)
plt.show()
In [8]:
sns.pairplot(advertising, x_vars=['TV', 'Newspaper', 'Radio'], y_vars='Sales', height=4, aspect=1, kind='scatter')
plt.show()
In [9]:
advertising.corr()
Out[9]:
  | TV | Radio | Newspaper | Sales |
---|---|---|---|---|
TV | 1.000000 | 0.054809 | 0.056648 | 0.901208 |
Radio | 0.054809 | 1.000000 | 0.354104 | 0.349631 |
Newspaper | 0.056648 | 0.354104 | 1.000000 | 0.157960 |
Sales | 0.901208 | 0.349631 | 0.157960 | 1.000000 |
In [13]:
sns.heatmap(advertising.corr(), cmap="YlGnBu", annot=True)
plt.show()
As is visible from the pairplot and the heatmap, TV is the variable most strongly correlated with Sales (r ≈ 0.90). So let's go ahead and build a regression model on Sales using TV as our single feature variable.
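To confirm that choice numerically, the per-feature correlations with Sales can be ranked directly from the correlation matrix computed above; a minimal sketch reusing the advertising DataFrame already in memory:

In [ ]:
# Rank the features by their correlation with Sales, strongest first
advertising.corr()['Sales'].sort_values(ascending=False)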
In [10]:
X = advertising['TV'].values.reshape(-1,1)
y = advertising['Sales'].values
In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, test_size=0.20, random_state=42)
In [12]:
X_train
Out[12]:
array([[116. ], [177. ], [ 43.1], [ 62.3], [224. ], [ 38.2], [ 70.6], [147.3], [104.6], [ 76.3], [ 78.2], [168.4], [ 8.7], [ 7.8], [ 76.4], [129.4], [ 73.4], [289.7], [ 19.6], [197.6], [284.3], [184.9], [112.9], [ 23.8], [290.7], [ 19.4], [293.6], [ 18.7], [134.3], [ 25.6], [100.4], [ 80.2], [188.4], [177. ], [125.7], [209.6], [142.9], [184.9], [222.4], [241.7], [ 17.2], [120.5], [ 89.7], [191.1], [ 75.5], [193.2], [ 85.7], [266.9], [ 39.5], [261.3], [ 13.2], [193.7], [296.4], [265.6], [214.7], [149.7], [131.7], [ 57.5], [240.1], [141.3], [180.8], [ 97.2], [220.5], [140.3], [255.4], [ 96.2], [ 66.1], [239.3], [175.7], [240.1], [ 17.9], [230.1], [283.6], [171.3], [199.1], [123.1], [131.1], [ 25.1], [163.5], [248.8], [202.5], [ 13.1], [ 4.1], [ 93.9], [262.9], [228.3], [253.8], [243.2], [239.8], [228. ], [215.4], [239.9], [107.4], [187.8], [206.9], [ 43. ], [151.5], [137.9], [182.6], [219.8], [156.6], [276.7], [205. ], [ 66.9], [ 76.4], [ 95.7], [120.2], [225.8], [ 28.6], [ 68.4], [248.4], [218.5], [109.8], [ 8.6], [ 97.5], [210.7], [164.5], [265.2], [281.4], [ 26.8], [276.9], [ 36.9], [206.8], [287.6], [102.7], [262.7], [ 90.4], [199.8], [ 94.2], [210.8], [227.2], [ 88.3], [237.4], [136.2], [172.5], [ 17.2], [ 59.6], [ 74.7], [149.8], [166.8], [ 44.5], [216.4], [ 44.7], [ 0.7], [121. ], [187.9], [135.2], [139.2], [110.7], [213.4], [ 18.8], [232.1], [218.4], [286. ], [109.8], [ 25. ], [204.1], [217.7], [165.6], [280.2]])
In [13]:
y_train
Out[13]:
array([11. , 14.8, 10.1, 9.7, 16.6, 7.6, 10.5, 14.6, 10.4, 12. , 14.6, 16.7, 7.2, 6.6, 9.4, 11. , 10.9, 25.4, 7.6, 16.7, 20. , 20.5, 11.9, 9.2, 17.8, 6.6, 20.7, 6.7, 14. , 9.5, 10.7, 11.9, 19.9, 17.1, 15.9, 20.9, 15. , 20.7, 16.7, 21.8, 12. , 14.2, 10.6, 17.3, 11.9, 20.2, 13.3, 25.4, 10.8, 24.2, 5.6, 19.2, 23.8, 17.4, 17.4, 17.3, 12.9, 11.8, 20.9, 15.5, 17.9, 13.2, 20.1, 10.3, 19.8, 12.3, 12.6, 20.7, 17.1, 18.2, 8. , 22.1, 25.5, 16. , 18.3, 15.2, 16. , 8.5, 18. , 18.9, 16.6, 5.3, 3.2, 15.3, 17. , 20.5, 17.6, 25.4, 17.3, 21.5, 17.1, 23.2, 11.5, 20.6, 17.9, 9.6, 16.5, 15. , 21.2, 19.6, 15.5, 16.8, 22.6, 9.7, 11.8, 11.9, 13.2, 18.4, 7.3, 13.6, 20.2, 17.2, 16.7, 4.8, 13.7, 18.4, 17.5, 17.7, 24.4, 8.8, 27. , 10.8, 17.2, 26.2, 14. , 20.2, 12. , 16.4, 14. , 23.8, 19.8, 12.9, 17.5, 13.2, 16.4, 5.9, 9.7, 14.7, 10.1, 19.6, 10.4, 22.6, 10.1, 1.6, 11.6, 19.7, 17.2, 12.2, 16. , 17. , 7. , 18.4, 18. , 20.9, 12.4, 7.2, 19. , 19.4, 17.6, 19.8])
In [14]:
import xgboost as xgb
xgb_model = xgb.XGBRegressor(objective="reg:squarederror")
xgb_model.fit(X_train,y_train)
Out[14]:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.300000012, max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0, num_parallel_tree=1, objective='reg:squarederror', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact', validate_parameters=1, verbosity=None)
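Before scoring, it helps to see what the tree-based fit looks like against the raw data. The sketch below plots the model's predictions over an evenly spaced grid of TV values on top of the training scatter; it assumes the fitted xgb_model, X, X_train, and y_train from the cells above (the grid variable is introduced here purely for illustration).

In [ ]:
# Plot the XGBoost fit as a curve over the training scatter
grid = np.linspace(X.min(), X.max(), 300).reshape(-1, 1)
plt.scatter(X_train.ravel(), y_train, alpha=0.4, label='training data')
plt.plot(grid.ravel(), xgb_model.predict(grid), color='red', label='XGBoost prediction')
plt.xlabel('TV')
plt.ylabel('Sales')
plt.legend()
plt.show()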
In [15]:
y_pred = xgb_model.predict(X_test)
y_pred
Out[15]:
array([17.880438 , 19.160408 , 20.711021 , 5.403233 , 19.767073 , 11.973633 , 22.146915 , 10.172751 , 16.855145 , 16.913172 , 7.488937 , 11.585327 , 18.448017 , 3.2186866, 12.286012 , 16.190283 , 6.4905276, 17.020899 , 11.973633 , 17.717775 , 21.8418 , 13.286675 , 7.768169 , 18.918385 , 13.286675 , 11.585327 , 17.239777 , 12.286012 , 12.900588 , 4.96285 , 16.671322 , 13.286675 , 18.017462 , 8.869742 , 19.99905 , 17.717775 , 10.172751 , 17.09226 , 11.441164 , 8.816983 ], dtype=float32)
In [16]:
xgb_model.score(X_train, y_train) * 100
Out[16]:
98.97085078718804
In [17]:
xgb_model.score(X_test, y_test) * 100
Out[17]:
81.48673425766624
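The score method of a regressor reports R² (multiplied by 100 here), and the gap between the train and test scores (about 99% vs. 81%) hints that the default XGBoost settings overfit this single-feature problem. Error metrics on the held-out set give a complementary picture in the original Sales units; a minimal sketch, assuming the y_pred predictions and the train/test split defined above:

In [ ]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Test-set errors in the same units as Sales
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print('RMSE:', round(rmse, 2), 'MAE:', round(mae, 2))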