From e3bf92dab20d504dd2b12e7f97a05157356d3b0b Mon Sep 17 00:00:00 2001
From: mjjo <myeongjin_jo@solomonsc.com>
Date: Mon, 13 Aug 2018 17:58:08 +0900
Subject: [PATCH] Add feature scaling and a gradient-descent loop to regression2.py

---
 .gitignore     |  1 +
 regression2.py | 50 +++++++++++++++++++++++++++++++++++++-------------
 regressions.py | 12 ++++++------
 3 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/.gitignore b/.gitignore
index 88dbff1..c230a68 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 .vs/
+__pycache__/
diff --git a/regression2.py b/regression2.py
index 6ed81bc..74df33c 100644
--- a/regression2.py
+++ b/regression2.py
@@ -6,39 +6,63 @@ import pandas as pd
 
 
 def load_data():
-	df = pd.read_csv('data/sample.txt', sep=',', header=None)
+	df = pd.read_csv('data/sample.txt', delimiter=',', header=None)  # header=None: columns get integer labels 0, 1, ...
+	df[2] = pd.Series([1]*len(df[0]))  # column of ones; NOTE(review): the identical assignment is repeated below, so this one looks redundant
+	# Disabled alternative inputs (kept for reference):
+	#df = pd.read_csv('data/ex1data1.txt', delimiter=',', header=None)
+	#df[2] = pd.Series([1]*len(df[0]))
+
+	#df = pd.read_csv('data/train.csv', delimiter=',', comment='#')
+	#df[0] = df['x']
+	#df[1] = df['y']
 
 	df[2] = pd.Series([1]*len(df[0]))
 	df = df.reindex(columns=[1, 2, 0])
 
 	return df
 
-def get_cost():
-	
-	pass
-
-def get_gradient():
-	pass
-
+def feature_scaling(x):  # standardize x column-wise; returns (scaled x, per-column mean, per-column std) so that scaled*std + mean == x
+	mean = x.mean(0)  # axis 0: one value per column
+	std = x.std(0)
+	mean[std==0], std[std==0] = 0, 1  # constant columns (e.g. an all-ones bias column) pass through unchanged instead of collapsing to 0
+	x = (x-mean)/std
+	return x, mean, std
 
 
 df = load_data()
-
-
 y = df.values[:, 0]
 x = df.values[:, 1:]
 m, n = x.shape
 w = np.zeros(n)
-a = 0.001
+a = 1.0e-3  # learning rate
 
+x, mean, std = feature_scaling(x)  # standardize x column-wise; the returned mean/std are not used further in this script
 
 sum_ = 0
 gap = 0.001
-
+break_gap = 1.0e-20  # stop once every component of the update step is smaller than this in magnitude
+trycnt = 0  # number of completed iterations
 while True:
 	h = x@w
 	cost = ((h-y)**2).mean()/2
-	gradient = a*(h-y)@x
+	gradient = a*(h-y)@x/m  # mean gradient already multiplied by the learning rate, i.e. the step applied to w
+	w -= gradient
 
+	sum_ += abs(gradient.sum())  # accumulated step size; used only to throttle progress output
+	if sum_ >= gap:
+		print('[{}] {} ({})'.format(trycnt, cost, w))
+		sum_ -= gap
 
+	trycnt += 1
+	if abs(gradient).max() < break_gap:  # largest magnitude, not |largest signed value|: a big negative component must not end the loop
+		break
+
+	if not np.isfinite(cost) or not np.isfinite(sum_):  # diverged to inf/nan: stop instead of looping forever
+		break
+
+print('[{}] {} ({})'.format(trycnt, cost, w))
+#print('x: {}'.format(x[:10]))
+#print('y: {}'.format(y[:10]))
+#print('h: {}'.format(h[:10]))
+print(np.c_[x[:10], y[:10], h[:10]])  # first 10 rows: scaled inputs, targets, predictions side by side
 pass
\ No newline at end of file
diff --git a/regressions.py b/regressions.py
index 0b90513..30fd30f 100644
--- a/regressions.py
+++ b/regressions.py
@@ -46,7 +46,7 @@ def get_derived_regularization_term(w: np.array, wlambda: float, alpha: float) -
 
 #data = np.loadtxt('data/ex1data1.txt', delimiter=',')
 # train excercize
-#data = np.loadtxt('data/train.csv', delimiter=',', skiprows=1, comments='#')
+data = np.loadtxt('data/train.csv', delimiter=',', skiprows=1, comments='#')
 # auto mpg
 #df = pd.read_csv('data/auto-mpg.data', sep='\t')
 #df = df.drop(df.loc[df['horsepower'] == '?'].index)
@@ -57,8 +57,8 @@ def get_derived_regularization_term(w: np.array, wlambda: float, alpha: float) -
 #df = df.convert_objects(convert_numeric=True)
 #data = df.values
 
-df = pd.read_csv('data/mlr01.csv', sep=',')
-data = df.values
+#df = pd.read_csv('data/mlr01.csv', sep=',')
+#data = df.values
 
 
 
@@ -76,7 +76,7 @@ is_2d = (m == 2)
 
 if is_2d == True:
 	g = graph.Graph()
-	g.draw_variable(x_init, y)
+	g.draw_variable(x[:, 1], y)
 
 alpha = 0.001
 
@@ -97,13 +97,13 @@ while True:
 		print("{} : {}".format(iter, cost))
 		
 		if is_2d == True:
-			g.draw_line(x_init, h)
+			g.draw_line(x[:, 1], h)
 
 	iter += 1
 
 	if max(abs(gradient)) < 1e-5:
 		if is_2d:
-			g.draw_line(x_init, h)
+			g.draw_line(x[:, 1], h)
 		break
 print('iteration: {}'.format(iter))