Download our e-book of Introduction To Python
Shashank Shanu
2 years ago
import pandas as pd
import numpy as np
data = pd.read_csv("Salary_Dataa.csv")
data
data.shape
(30, 2)
data.info()  # per-column dtype, non-null count and memory footprint
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 YearsExperience 30 non-null float64
1 Salary 30 non-null float64
dtypes: float64(2)
memory usage: 608.0 bytes
# Bug fix: `isnone` is not a pandas method — the text was mangled (null→none);
# restore `isnull`. NOTE: .count() reports total entries per column (30 each,
# matching the output below); use data.isnull().sum() to count actual NaNs.
data.isnull().count()
YearsExperience 30
Salary 30
dtype: int64
# Split the frame into the feature matrix X (every column except the last,
# i.e. YearsExperience) and the target vector Y (column 1, Salary).
feature_frame = data.iloc[:, :-1]
X = feature_frame.values
Y = data.iloc[:, 1].values
X
array([[ 1.1],
[ 1.3],
[ 1.5],
[ 2. ],
[ 2.2],
[ 2.9],
[ 3. ],
[ 3.2],
[ 3.2],
[ 3.7],
[ 3.9],
[ 4. ],
[ 4. ],
[ 4.1],
[ 4.5],
[ 4.9],
[ 5.1],
[ 5.3],
[ 5.9],
[ 6. ],
[ 6.8],
[ 7.1],
[ 7.9],
[ 8.2],
[ 8.7],
[ 9. ],
[ 9.5],
[ 9.6],
[10.3],
[10.5]])
Y
array([ 39343., 46205., 37731., 43525., 39891., 56642., 60150.,
54445., 64445., 57189., 63218., 55794., 56957., 57081.,
61111., 67938., 66029., 83088., 81363., 93940., 91738.,
98273., 101302., 113812., 109431., 105582., 116969., 112635.,
122391., 121872.])
import matplotlib.pyplot as plt
# Quick look at the raw experience-vs-salary relationship before modelling.
plt.scatter(X,Y,color = "blue")
from sklearn.model_selection import train_test_split
# Hold out a third of the 30 rows (10 samples) for testing;
# random_state=0 makes the shuffle reproducible.
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.33,random_state = 0)
x_train
array([[ 2.9],
[ 5.1],
[ 3.2],
[ 4.5],
[ 8.2],
[ 6.8],
[ 1.3],
[10.5],
[ 3. ],
[ 2.2],
[ 5.9],
[ 6. ],
[ 3.7],
[ 3.2],
[ 9. ],
[ 2. ],
[ 1.1],
[ 7.1],
[ 4.9],
[ 4. ]])
y_train
array([ 56642., 66029., 64445., 61111., 113812., 91738., 46205.,
121872., 60150., 39891., 81363., 93940., 57189., 54445.,
105582., 43525., 39343., 98273., 67938., 56957.])
y_test
array([ 37731., 122391., 57081., 63218., 116969., 109431., 112635.,
55794., 83088., 101302.])
from sklearn.linear_model import LinearRegression

# Fit an ordinary-least-squares line to the training split.
# LinearRegression.fit returns the estimator itself, so the call can be chained.
regressor = LinearRegression().fit(x_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
y_pred = regressor.predict(x_test)  # predicted salaries for the held-out experience values
y_pred
array([ 40835.10590871, 123079.39940819, 65134.55626083, 63265.36777221,
115602.64545369, 108125.8914992 , 116537.23969801, 64199.96201652,
76349.68719258, 100649.1375447 ])
y_test
array([ 37731., 122391., 57081., 63218., 116969., 109431., 112635.,
55794., 83088., 101302.])
residue = y_pred - y_test  # residuals (prediction error): predicted minus actual salary per test sample
residue
array([ 3104.10590871, 688.39940819, 8053.55626083, 47.36777221,
-1366.35454631, -1305.1085008 , 3902.23969801, 8405.96201652,
-6738.31280742, -652.8624553 ])
plt.scatter(x_train,y_train,color="red")
plt.plot(x_train,regressor.predict(x_train),color="yellow")
plt.title("Salary VS Experience(Training Set)")
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.show()
plt.scatter(x_test,y_test,color="Red")
plt.plot(x_train,regressor.predict(x_train),color="yellow")
plt.title("Salary VS Experience(Training Set)")
plt.xlabel("Years of Experience")
plt.ylabel("Salary")
plt.show()
# Learned parameters: slope (salary increase per extra year of experience)
# and intercept (baseline salary at zero experience).
print(regressor.coef_)
print(regressor.intercept_)
[9345.94244312]
26816.192244031176
y_test.shape  # sanity check: 10 held-out targets
(10,)
y_pred.shape  # matches y_test: one prediction per test sample
(10,)
from sklearn.metrics import mean_squared_error, r2_score

# Score the model on the held-out split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # root-mean-squared error, in salary units
r2 = r2_score(y_test, y_pred)                       # R-squared: fraction of variance explained
print("RMSE =", rmse)
print("R2 Score=", r2)
RMSE = 4585.415720467589
R2 Score= 0.9749154407708353