import warnings

# Silence the binary-compatibility warning emitted by some numpy/pandas builds.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

import numpy as np
import pandas as pd


def load_data():
    """Load the training set and arrange columns as [target, bias, feature].

    Reads ``data/train.csv`` (lines starting with '#' are treated as
    comments) and returns a DataFrame whose columns are, in order:
    column 1 = target values (the 'y' column), column 2 = a constant-1
    bias/intercept column, column 0 = the raw feature (the 'x' column).
    """
    df = pd.read_csv('data/train.csv', delimiter=',', comment='#')
    df[0] = df['x']
    df[1] = df['y']
    df[2] = pd.Series([1] * len(df[0]))  # bias column of ones
    # Reorder so the target comes first and [bias, feature] form the
    # design matrix that the caller slices off with values[:, 1:].
    df = df.reindex(columns=[1, 2, 0])
    return df


def feature_scaling(x):
    """Standardize each column of x, leaving column 0 (the bias) untouched.

    Parameters
    ----------
    x : ndarray of shape (m, n)
        Design matrix whose first column is the constant bias term.

    Returns
    -------
    tuple of (scaled, mean, std)
        The standardized matrix plus the per-column mean and std that were
        applied, so new samples can be transformed identically.
    """
    mean = x.mean(0)
    mean[0] = 0          # never shift the bias column
    std = x.std(0)
    std[std == 0] = 1    # guard: constant columns would divide by zero
    std[0] = 1           # never rescale the bias column
    x = (x - mean) / std
    return x, mean, std


def main():
    """Fit a linear model by batch gradient descent, printing progress."""
    df = load_data()
    y = df.values[:, 0]
    x = df.values[:, 1:]
    m, n = x.shape
    w = np.zeros(n)
    a = 1.0e-3           # learning rate

    x, mean, std = feature_scaling(x)

    sum_ = 0             # accumulated |gradient| -- throttles printing
    gap = 0.001          # print once per `gap` of accumulated movement
    break_gap = 1.0e-10  # convergence threshold on the gradient
    trycnt = 0
    while True:
        h = x @ w
        cost = ((h - y) ** 2).mean() / 2
        gradient = a * (h - y) @ x / m  # already scaled by the learning rate
        w -= gradient

        sum_ += abs(gradient.sum())
        if sum_ >= gap:
            print('[{}] {} ({})'.format(trycnt, cost, w))
            sum_ -= gap
            trycnt += 1

        # BUGFIX: converge on the largest |component| of the gradient.
        # The original `abs(gradient.max())` took abs of the *maximum*
        # component, so a large negative component could look converged.
        if np.abs(gradient).max() < break_gap:
            break
        if not np.isfinite(cost) or not np.isfinite(sum_):
            break  # diverged (learning rate too high) -- stop cleanly

    print('[{}] {} ({})'.format(trycnt, cost, w))
    print(np.c_[x[:10], y[:10], h[:10]])


if __name__ == "__main__":
    main()