work1

%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Exercise 1: Linear Regression with 1 variable

1. Plotting the data

# Exercise 1, step 1 — load the population/profit data and scatter-plot it.
data = pd.read_csv('ex1data1.txt', names=['Population', 'Profit'], header=None)
x = data['Population']
x
y = data['Profit']

fig = plt.figure()
axe = fig.add_axes([0.1, 0.1, 0.8, 0.8])
axe.set_xlabel('Population')
axe.set_ylabel('Profits')
axe.set_title('the scatter of PP')
axe.scatter(x, y)

# Keep the raw series around for the final regression-line plot.
x1 = x
y1 = y
m = len(data)

# Prepend a column of ones (intercept term) to form the design matrix.
x = x.to_frame()
x.insert(0, 'Ones', 1)
x

2. Gradient descent

def hypothe(x, theta):
    """Linear hypothesis h(x) = x @ theta for design matrix x."""
    return x.dot(theta)

def cost(u, y):
    """Squared-error cost J = 1/(2m) * sum((u - y)^2), u = predictions."""
    s = len(y)
    return 1 / (2 * s) * sum((u - y) * (u - y))

theta = np.zeros(2)
x
theta
iterations = 1500
alpha = 0.01
cost_history = []
# Batch gradient descent with a *simultaneous* update: the original updated
# theta[j] in place inside the inner loop, so later components were computed
# against a partially-updated theta. The vectorized gradient below uses the
# current theta for every component at once.
# Also fixed: `i=1; while i<iterations` only ran 1499 of the 1500 iterations.
for _ in range(iterations):
    grad = np.asarray(x.T.dot(hypothe(x, theta) - y)) / m
    theta = theta - alpha * grad
    # Collect costs in a list (O(n)) instead of np.hstack per step (O(n^2)).
    cost_history.append(cost(hypothe(x, theta), y))
cost_com = np.array(cost_history)
cost_com
theta
Ones Population
0 1 6.1101
1 1 5.5277
2 1 8.5186
3 1 7.0032
4 1 5.8598
... ... ...
92 1 5.8707
93 1 5.3054
94 1 8.2934
95 1 13.3940
96 1 5.4369

97 rows × 2 columns

3. Visualizing J

# Plot the cost J against the iteration count.
# Fixed: the axis labels were swapped — axe.plot(cost_com) puts the
# iteration index on the x axis and the cost on the y axis.
fig = plt.figure()
axe = fig.add_axes([0.1, 0.1, 1., 1.])
axe.set_title('Cost function')
axe.set_xlabel('the time of iterations')
axe.set_ylabel('the cost')
axe.plot(cost_com)
cost(hypothe(x, theta), y)
4.483134519095647
# Evaluate the fitted line h(x) = theta0 + theta1*x on a dense grid and
# overlay it on the original scatter.
tx = np.vstack((np.ones(1000), np.linspace(5, 30, 1000))).T
fig = plt.figure()
axe = fig.add_axes([0.1, 0.1, 1., 1.])
axe.set_title('the Result of Linear Regression')
axe.set_xlabel('the population')
# Fixed typo in the user-facing label: 'porfit' -> 'profit'.
axe.set_ylabel('the profit')
axe.plot(np.linspace(5, 30, 1000), tx.dot(theta), 'r')
axe.scatter(x1, y1)

Exercise 2: Linear Regression with multiple variables

1. Feature normalization

# Exercise 2, step 1 — load the housing data and z-score the two features.
data = pd.read_csv('ex1data2.txt', names=['sizes', 'bedrooms', 'price'], header=None)
data.head()
data.describe()
mean_std = pd.DataFrame([], columns=['mean', 'std'], index=['sizes', 'bedrooms'])
for i in range(2):
    # Compute each column's statistics once and reuse them both for the
    # record table and for the normalization itself.
    mu = data.iloc[:, i].mean()
    sigma = data.iloc[:, i].std()
    mean_std.iloc[i, :] = [mu, sigma]
    data.iloc[:, i] = (data.iloc[:, i] - mu) / sigma
mean_std
# Fixed: the original read `m = len(X)`, but X is not defined until later;
# the number of training examples comes from the data itself.
m = len(data)
sizes bedrooms price
0 2104 3 399900
1 1600 3 329900
2 2400 3 369000
3 1416 2 232000
4 3000 4 539900
sizes bedrooms price
count 47.000000 47.000000 47.000000
mean 2000.680851 3.170213 340412.659574
std 794.702354 0.760982 125039.899586
min 852.000000 1.000000 169900.000000
25% 1432.000000 3.000000 249900.000000
50% 1888.000000 3.000000 299900.000000
75% 2269.000000 4.000000 384450.000000
max 4478.000000 5.000000 699900.000000
mean std
sizes 2000.68 794.702
bedrooms 3.17021 0.760982
# Design matrix: intercept column of ones plus the two normalized features.
X = np.hstack((np.ones((m, 1)), data.iloc[:, :-1]))
y = data.iloc[:, -1]
theta = np.zeros(3)
n = len(theta)

def hypothe_multi(theta, X):
    """Linear hypothesis h(X) = X @ theta."""
    return X.dot(theta)

def cost_multi(theta, X, y):
    """Squared-error cost J = 1/(2m) * ||X theta - y||^2."""
    s = len(y)
    r = X.dot(theta) - y
    return 1 / (2 * s) * r.dot(r)

iterations = 1500
alpha = 0.01
cost_history = []
# Batch gradient descent with a simultaneous update. Fixed two defects of
# the original: (1) theta[j] was updated in place inside the inner loop, so
# later components used a partially-updated theta; (2) `i=1; while
# i<iterations` ran only 1499 of the intended 1500 iterations.
for _ in range(iterations):
    grad = np.asarray(X.T.dot(hypothe_multi(theta, X) - y)) / m
    theta = theta - alpha * grad
    cost_history.append(cost_multi(theta, X, y))
cost_com_multi = np.array(cost_history)
fig = plt.figure()
axe = fig.add_axes([0.1, 0.1, 0.8, 0.8])
# Fixed: labels were swapped — plot() puts iterations on x, cost on y.
axe.set_xlabel('the time of iterations')
axe.set_ylabel('the cost')
axe.set_title('the multi-variable linear regression')
axe.plot(cost_com_multi)
theta
array([340412.56203904, 110541.98853995,  -6560.60505947])
上一篇:最喜欢的一幅图——致祖国改革开放40周年


下一篇:信息检索中的非负矩阵分解