Linear Regression Essentials¶
Notebook Intro
This notebook introduces linear regression through a rent prediction example. It explains the core concepts—input shapes, model formula, and scikit-learn parameters—while walking step by step through implementation. As part of the series, it serves as a foundation for more advanced machine learning projects.
Input Shapes¶
Feature Matrix (X)¶
- Shape: `(n_samples, n_features)`
- Example: `x = np.array(area).reshape(-1, 1)`
Target Vector (y)¶
- Shape: `(n_samples,)` or `(n_samples, 1)`
- Example: `y = np.array(rent).reshape(-1, 1)`
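To make the shapes concrete, here is a minimal sketch using small made-up area/rent lists (not the notebook's CSV) that shows what `reshape(-1, 1)` produces:

```python
import numpy as np

# Illustrative data only (not from housing_data_large.csv)
area = [900, 1500, 2100]
rent = [950, 1450, 2000]

x = np.array(area).reshape(-1, 1)  # column vector: shape (n_samples, 1)
y = np.array(rent)                 # 1-D target:    shape (n_samples,)

print(x.shape)  # (3, 1)
print(y.shape)  # (3,)
```

`reshape(-1, 1)` tells NumPy to infer the number of rows (`-1`) while forcing exactly one column, which is the 2-D layout scikit-learn expects for `X`.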
Linear Model Formula¶
$$ \hat{y} = \text{intercept\_} + \text{coef\_}[0] \cdot x $$
Where:
- $ \hat{y} $ = predicted output
- `intercept_` = model bias
- `coef_` = slope of the line
Key Parameters¶
| Symbol | scikit-learn | Description |
|---|---|---|
| $b_0$ | `intercept_` | Y-intercept (bias term) |
| $b_1$ | `coef_[0]` | Slope (per-unit change) |
| $x$ | Input feature | e.g., area |
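The table above maps directly onto scikit-learn's attributes. As a quick sanity check, here is a sketch on hypothetical toy data showing that `intercept_ + coef_[0] * x` reproduces `model.predict`:

```python
import numpy as np
from sklearn.linear_model import LinearRegression

# Hypothetical toy data, for illustration only
x = np.array([900, 1500, 2100, 2700]).reshape(-1, 1)
y = np.array([950, 1450, 2000, 2400])

model = LinearRegression().fit(x, y)

# Manual prediction via the formula y_hat = intercept_ + coef_[0] * x
x_new = 1800
manual = model.intercept_ + model.coef_[0] * x_new
sklearn_pred = model.predict([[x_new]])[0]

print(manual, sklearn_pred)  # the two values match
```

This confirms that `predict` is nothing more than the fitted line evaluated at the input.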
from google.colab import files
uploaded = files.upload()
Saving housing_data_large.csv to housing_data_large.csv
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
df = pd.read_csv("housing_data_large.csv")
df.head(5)
| | Area (sqft) | Bedrooms | Bathrooms | Parking | Location | Rent (USD) |
|---|---|---|---|---|---|---|
| 0 | 2100 | 4 | 3 | 3 | Rural | 2144 |
| 1 | 910 | 2 | 2 | 0 | Urban | 897 |
| 2 | 2163 | 2 | 2 | 0 | Urban | 1574 |
| 3 | 2999 | 3 | 2 | 0 | Urban | 2360 |
| 4 | 1474 | 1 | 4 | 2 | Urban | 1494 |
area = df['Area (sqft)']
rent = df['Rent (USD)']
# Filter all houses with rent > 2000
high_rent = df[df["Rent (USD)"] > 2000]
print(high_rent["Rent (USD)"].mean())
2333.52
x = np.array(area).reshape(-1,1)
y = np.array(rent)
print(x,y)
[[2100] [ 910] [2163] [2999] [1474] [2159] [ 971] [2534] [1180] [2988] [1786] [3461] [2629] [3377] [ 984] [2285] [1488] [1880] [ 953] [3146] [2771] [2201] [1670] [3239] [2867] [1907] [2810] [2134] [1368] [2211] [2472] [3073] [ 869] [3344] [3491] [2243] [2980] [2949] [1868] [3013] [2686] [3212] [2665] [1774] [2906] [3062] [1504] [2644] [1266] [2789]] [2144 897 1574 2360 1494 1831 1286 1959 1546 2057 1729 2853 2041 2513 958 1887 1392 1419 652 2685 2073 1820 1730 2453 2136 1881 2260 2060 925 1793 2111 2261 1112 2491 2231 1728 2628 2237 1672 2712 2096 2444 2335 1657 2512 2505 1547 2140 1288 1915]
import matplotlib.pyplot as plt
model = LinearRegression()
model.fit(x,y)
print("Slope (coefficient):", model.coef_[0])
print("Intercept:", model.intercept_)
plt.scatter(x,y, color='green', label='actual data')
plt.hlines(np.mean(y), xmin=x.min(), xmax=x.max(), colors='blue', linestyles='dashed', label='Average line')
plt.plot(x,model.predict(x),color='black', label="predicted line")
plt.title("House Rent vs. Area")
plt.xlabel("Area (sq ft)")
plt.ylabel("Rent (USD)")
plt.legend()
plt.show()
Slope (coefficient): 0.6190317188191818 Intercept: 491.1938580746271
# Predict rent for 2,701 sq ft and 900 sq ft homes
model.predict([[2701],[900]])
array([2163.19853061, 1048.32240501])
**Residual (error) of each observation**

$$ \epsilon = y_{\text{actual}} - y_{\text{predicted}} $$
y_pred = model.predict(x)
# Print each (x, actual y, predicted y) triple, then draw its residual as a dashed segment
for xi, yi, yp in zip(x, y, y_pred):
    print(xi, yi, yp)
    plt.plot([xi, xi], [yi, yp], color='red', linestyle='--')
plt.plot(x,y_pred, color='black', label='Prediction Line')
# Plot actual points
plt.scatter(x, y, color='green', label='Actual Points')
plt.legend()
[2100] 2144 1791.160467594909 [910] 897 1054.5127222000824 [2163] 1574 1830.1594658805175 [2999] 2360 2347.6699828133533 [1474] 1494 1403.6466116141012 [2159] 1831 1827.6833390052407 [971] 1286 1092.2736570480527 [2534] 1959 2059.8202335624337 [1180] 1546 1221.6512862812615 [2988] 2057 2340.8606339063426 [1786] 1729 1596.7845078856858 [3461] 2853 2633.662636907816 [2629] 2041 2118.628246850256 [3377] 2513 2581.6639725270043 [984] 958 1100.3210693927022 [2285] 1887 1905.6813355764575 [1488] 1392 1412.3130556775695 [1880] 1419 1654.973489454689 [953] 652 1081.1310861093075 [3146] 2685 2438.6676454797735 [2771] 2073 2206.53075092258 [2201] 1820 1853.6826711956464 [1670] 1730 1524.9768285026607 [3239] 2453 2496.237595329957 [2867] 2136 2265.9577959292214 [1907] 1881 1671.6873458628068 [2810] 2260 2230.6729879565282 [2134] 2060 1812.2075460347612 [1368] 925 1338.029249419268 [2211] 1793 1859.8729883838382 [2472] 2111 2021.4402669956446 [3073] 2261 2393.4783300059726 [869] 1112 1029.132421728496 [3344] 2491 2561.2359258059714 [3491] 2231 2652.2335884723907 [2243] 1728 1879.682003386052 [2980] 2628 2335.908380155789 [2949] 2237 2316.7183968723944 [1868] 1672 1647.5451088288587 [3013] 2712 2356.336426876822 [2686] 2096 2153.9130548229496 [3212] 2444 2479.523738921839 [2665] 2335 2140.9133887277467 [1774] 1657 1589.3561272598556 [2906] 2512 2290.1000329631697 [3062] 2505 2386.668981098962 [1504] 1547 1422.2175631786765 [2644] 2140 2127.913722632544 [1266] 1288 1274.8880140997112 [2789] 1915 2217.6733218613253
Model Evaluation Metrics¶
1. MSE (Mean Squared Error)
$$ MSE = \frac{1}{n} \sum (y_{\text{actual}} - y_{\text{predicted}})^2 $$

- Measures the average squared difference between actual and predicted values
- Units = squared output units (e.g., $²)
- Lower is better
2. RMSE (Root Mean Squared Error)

$$ RMSE = \sqrt{MSE} $$

- In the original output units (e.g., $)
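The formulas above can be checked by hand. This sketch uses a few actual/predicted pairs rounded from the output earlier in the notebook, and verifies that the manual computation matches scikit-learn's `mean_squared_error`:

```python
import numpy as np
from sklearn.metrics import mean_squared_error

# A few actual vs predicted rents, rounded from the printed output above
y_actual = np.array([2144, 897, 1574, 2360])
y_hat    = np.array([1791.2, 1054.5, 1830.2, 2347.7])

# MSE: mean of squared residuals; RMSE: its square root (back in dollars)
mse_manual = np.mean((y_actual - y_hat) ** 2)
rmse_manual = np.sqrt(mse_manual)

print(mse_manual, rmse_manual)
```

Because RMSE is just the square root of MSE, it is always reported in the same units as the target.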
# Calculate the MSE and RMSE
print("MSE",mean_squared_error(y,y_pred))
print("RMSE",np.sqrt(mean_squared_error(y,y_pred)))
MSE 37793.42010270429 RMSE 194.40529854585827
Explanation
- MSE helps during training but is not intuitive to explain directly.
- RMSE = 194.41: on average, the model's predicted rent is off by about $194.41 from the actual rent.
R² Score (Coefficient of Determination)¶
$$ R^2 = 1 - \frac{\text{RSS}}{\text{TSS}} $$
- Measures how well the model explains the target
- Range: [0, 1] (or negative if model is worse than mean)
Additional Concepts¶
Residual Sum of Squares (RSS)¶
$$ \text{RSS} = \sum (y_i - \hat{y}_i)^2 $$
Total Sum of Squares (TSS)¶
$$ \text{TSS} = \sum (y_i - \bar{y})^2 $$
Explained Variance¶
$$ \text{Explained Variance} = \text{TSS} - \text{RSS} $$
Correlation vs R²¶
| Term | Meaning | Range | Symmetric | Based On Model? |
|---|---|---|---|---|
| Correlation (r) | Strength & direction of linear relation | [-1, 1] | Yes | No |
| R² Score | % variance explained by the model | [0, 1] | No | Yes |
In Simple Linear Regression:¶
$$ R^2 = r^2 $$
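The identity $R^2 = r^2$ can be verified numerically. This is a sketch on synthetic single-feature data (generated here, not the notebook's CSV), comparing `r2_score` against the squared Pearson correlation:

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Synthetic data: rent roughly linear in area, plus noise (illustrative)
rng = np.random.default_rng(0)
x = rng.uniform(800, 3500, 50)
y = 0.6 * x + 500 + rng.normal(0, 200, 50)

model = LinearRegression().fit(x.reshape(-1, 1), y)
r2 = r2_score(y, model.predict(x.reshape(-1, 1)))

r = np.corrcoef(x, y)[0, 1]  # Pearson correlation coefficient

print(r2, r ** 2)  # equal for simple OLS with an intercept
```

This equality holds only for simple (one-feature) linear regression fitted with an intercept; with multiple features, $R^2$ no longer equals any single pairwise $r^2$.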
Interpretation Tips¶
- RMSE = Average error in target units
- R² = Fraction of variance explained
- Use both: RMSE for accuracy, R² for model relevance
- Visualize residuals to see error distribution
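Following the last tip, residuals can be visualized as a histogram. A minimal sketch on synthetic data (the filename `residuals.png` is arbitrary):

```python
import numpy as np
import matplotlib
matplotlib.use("Agg")  # render without a display
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Synthetic area/rent data (illustrative only)
rng = np.random.default_rng(1)
x = rng.uniform(800, 3500, 60).reshape(-1, 1)
y = 0.6 * x.ravel() + 500 + rng.normal(0, 200, 60)

model = LinearRegression().fit(x, y)
residuals = y - model.predict(x)  # actual minus predicted

plt.hist(residuals, bins=15, color='green', edgecolor='black')
plt.title("Residual distribution")
plt.xlabel("Residual (actual - predicted)")
plt.ylabel("Count")
plt.savefig("residuals.png")
```

For a well-specified linear model the residuals should look roughly symmetric around zero; skew or multiple modes suggest missing features or nonlinearity.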
Summary:¶
A model with low RMSE and high R² is generally strong.
Correlation helps diagnose if linear regression is a good fit.
R² shows how well the model actually fits.
# Calculate R squared, TSS, and RSS
from sklearn.metrics import r2_score
print("R Squared", r2_score(y,y_pred))
print("TSS", np.sum((y-np.mean(y))**2))
print("RSS", np.sum((y-y_pred)**2))
R Squared 0.8547486695171209 TSS 13009664.0 RSS 1889671.0051352142
Metrics Explanation¶
The model explains ~85.5% of the variation in the target variable (y, e.g., rent).
The remaining 14.5% is unexplained error — likely due to noise, missing features, or nonlinear patterns.
Rule of thumb:
- R² close to 1 → excellent model
- R² near 0 → model does not explain variance better than the mean
- So, 0.85 = strong performance, but still room to improve.
TSS
The total squared error you would get if you used the mean rent as the prediction for every house.
RSS
The leftover squared error after the model has made its predictions.
Easy alternative Explanation of TSS RSS and R Squared¶
TSS = How much the actual data varies from the mean (total variation); represented by vertical error lines from the actual points to the mean line.
RSS = What the model couldn't explain (leftover error); represented by vertical error lines from the actual points to the model's predicted line.
TSS − RSS = What the model did explain
Therefore, R² = How much better the model is than just using the average
$$ R^{2} = 1 - \frac{\text{Model Error (RSS)}}{\text{Average-only Error (TSS)}} = \frac{\text{Explained Variation}}{\text{TSS}} $$
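To close the loop, the formula above can be computed by hand and checked against `r2_score`. This sketch reuses a few actual/predicted rents rounded from the printed output earlier:

```python
import numpy as np
from sklearn.metrics import r2_score

# A few actual vs predicted rents, rounded from the output above
y = np.array([2144, 897, 1574, 2360, 1494])
y_hat = np.array([1791.2, 1054.5, 1830.2, 2347.7, 1403.6])

rss = np.sum((y - y_hat) ** 2)        # model error
tss = np.sum((y - np.mean(y)) ** 2)   # average-only error
r2_manual = 1 - rss / tss

print(r2_manual, r2_score(y, y_hat))  # the two values match
```

The manual value agrees with scikit-learn because `r2_score` implements exactly this ratio.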