aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/analysis.py49
-rw-r--r--src/cli.py29
-rwxr-xr-x[-rw-r--r--]src/dataset.py10
-rwxr-xr-x[-rw-r--r--]src/describe.py27
-rwxr-xr-xsrc/histogram.py28
-rwxr-xr-x[-rw-r--r--]src/logreg_predct.py11
-rwxr-xr-x[-rw-r--r--]src/logreg_train.py52
-rw-r--r--src/model.py69
-rwxr-xr-x[-rw-r--r--]src/pair_plot.py17
-rwxr-xr-x[-rw-r--r--]src/scatter_plot.py17
10 files changed, 141 insertions, 168 deletions
diff --git a/src/analysis.py b/src/analysis.py
deleted file mode 100644
index b6c9eb9..0000000
--- a/src/analysis.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import numpy as np
-import pandas as pd
-import matplotlib.pyplot as plt
-
-from dataset import Dataset
-import dslr_stat
-
-
-class Analysis(Dataset):
- def __init__(self, path):
- super().__init__(path)
-
- def describe(self):
- desc_df = pd.DataFrame(
- dtype=np.float64,
- columns=[c for c, t in zip(self.df.columns, self.df.dtypes) if t == np.float64],
- index=['Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max']
- )
- for col in desc_df.columns:
- desc_df.loc['Count', col] = len(self.df[col])
- desc_df.loc['Mean', col] = dslr_stat.mean(self.df[col])
- desc_df.loc['Std', col] = dslr_stat.std(self.df[col])
- desc_df.loc['Min', col] = dslr_stat.min(self.df[col])
- desc_df.loc['25%', col] = dslr_stat.q25(self.df[col])
- desc_df.loc['50%', col] = dslr_stat.median(self.df[col])
- desc_df.loc['75%', col] = dslr_stat.q75(self.df[col])
- desc_df.loc['Max', col] = dslr_stat.max(self.df[col])
- print(desc_df)
-
- def hist(self):
- pass
-
- def scatter(self):
- plt.scatter(self.df['astronomy'], self.df['defense_against_the_dark_arts'])
- plt.show()
-
- def pair_plot(self):
- scores = self.df_scores
- fig, axis = plt.subplots(nrows=scores.shape[1],
- ncols=scores.shape[1])
- for i, col in enumerate(scores.columns):
- for j, pair_col in enumerate(scores.columns):
- ax = axis[i, j]
- if pair_col == col:
- ax.hist(scores)
- continue
- ax.scatter(scores[col], scores[pair_col])
- plt.tight_layout()
- plt.show()
diff --git a/src/cli.py b/src/cli.py
deleted file mode 100644
index ec1a324..0000000
--- a/src/cli.py
+++ /dev/null
@@ -1,29 +0,0 @@
-class CommandLineInterface:
- def __init__(self):
- pass
-
- def parse_args(self):
- parse = argparse.ArgumentParser(prog="dslr_cli",
- description="CLI for the dslr project")
- subparser = parser.add_subparsers(dest="subparser_name")
- parser_describe = subparsers.add_parser("describe",
- help="give useful information about a dataset")
- parser_describe.add_argument("path", help="path to the dataset")
- parser_describe.set_defaults(func=self._describe)
-
- self.args = parser.parse_args(sys.argv[1:])
-
- def exec_args(self):
- if self.args.subparser_name is None:
- print("{} --help for more information".format(sys.argv[0]))
- return
- self.args.func()
-
- def _describe(self):
- describe.describe(self.args.path)
-
-
-if __name__ == "__main__":
- cli = CommandLineInterface()
- cli.parse_args()
- cli.exec_args()
diff --git a/src/dataset.py b/src/dataset.py
index 650d334..e9f4b44 100644..100755
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -1,3 +1,7 @@
+#!/bin/python3
+
+import sys
+
import pandas as pd
@@ -9,13 +13,19 @@ class Dataset:
except FileNotFoundError:
raise "Couldn't find dataset at: {}".format(path)
self.df.drop(columns=['Index'], inplace=True)
+ self.df.dropna(axis=1, how="all", inplace=True)
self.df.dropna(inplace=True)
self.df.columns = self.df.columns.str.lower()
self.df.columns = self.df.columns.str.replace(' ', '_')
self.df.rename(columns={'hogwarts_house': 'house'}, inplace=True)
+ self.df.rename(columns={'care_of_magical_creatures': 'magical_creatures'}, inplace=True)
+ self.df.rename(columns={'defense_against_the_dark_arts': 'defense_dark_arts'}, inplace=True)
@property
def df_scores(self):
return self.df.loc[:, 'arithmancy':'flying']
+if __name__ == "__main__":
+ d = Dataset(sys.argv[1])
+ print(d.df)
diff --git a/src/describe.py b/src/describe.py
index 4a3c5bc..3e54c64 100644..100755
--- a/src/describe.py
+++ b/src/describe.py
@@ -1,11 +1,30 @@
+#!/bin/python3
+
import sys
-from analysis import Analysis
+import pandas as pd
+import numpy as np
+
+from dataset import Dataset
+import dslr_stat
if __name__ == "__main__":
if len(sys.argv) != 2:
raise "Usage: {} dataset_path".format(sys.argv[0])
- a = Analysis(sys.argv[1])
- a.describe()
- print(a.df_scores.describe())
+ d = Dataset(sys.argv[1])
+ desc_df = pd.DataFrame(
+ dtype=np.float64,
+ columns=[c for c, t in zip(d.df.columns, d.df.dtypes) if t == np.float64],
+ index=['Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max']
+ )
+ for col in desc_df.columns:
+ desc_df.loc['Count', col] = len(d.df[col])
+ desc_df.loc['Mean', col] = dslr_stat.mean(d.df[col])
+ desc_df.loc['Std', col] = dslr_stat.std(d.df[col])
+ desc_df.loc['Min', col] = dslr_stat.min(d.df[col])
+ desc_df.loc['25%', col] = dslr_stat.q25(d.df[col])
+ desc_df.loc['50%', col] = dslr_stat.median(d.df[col])
+ desc_df.loc['75%', col] = dslr_stat.q75(d.df[col])
+ desc_df.loc['Max', col] = dslr_stat.max(d.df[col])
+ print(desc_df)
diff --git a/src/histogram.py b/src/histogram.py
new file mode 100755
index 0000000..1200233
--- /dev/null
+++ b/src/histogram.py
@@ -0,0 +1,28 @@
+#!/bin/python3
+
+import sys
+
+import matplotlib.pyplot as plt
+
+from dataset import Dataset
+
+
+def house_hist(ax, d, house_name):
+ h = d.df[d.df["house"] == house_name]
+ scores = h.loc[:, "arithmancy":"flying"]
+ x = (scores - scores.min()) / (scores.max() - scores.min())
+ ax.hist(x.values.flatten(), bins=40, rwidth=0.8)
+ ax.set_title(house_name)
+
+if __name__ == "__main__":
+ if len(sys.argv) != 2:
+        raise SystemExit("Usage: {} dataset_path".format(sys.argv[0]))
+ d = Dataset(sys.argv[1])
+
+ fig, axs = plt.subplots(2, 2, sharey=True, tight_layout=True)
+ house_hist(axs[0][0], d, "Gryffindor")
+ house_hist(axs[0][1], d, "Slytherin")
+ house_hist(axs[1][0], d, "Ravenclaw")
+ house_hist(axs[1][1], d, "Hufflepuff")
+ plt.show()
+
diff --git a/src/logreg_predct.py b/src/logreg_predct.py
index ae416b0..0f09c31 100644..100755
--- a/src/logreg_predct.py
+++ b/src/logreg_predct.py
@@ -1,11 +1,8 @@
-import sys
-
-from model import Model
+#!/bin/python3
+import sys
if __name__ == '__main__':
- if len(sys.argv) != 3:
- raise 'Usage: {} dataset_path weights_path'.path(*sys.argv[1:])
- m = Model()
- m.predict()
+    if len(sys.argv) != 2:
+        raise SystemExit('Usage: {} dataset_path'.format(sys.argv[0]))
diff --git a/src/logreg_train.py b/src/logreg_train.py
index 8bc9a25..e02e101 100644..100755
--- a/src/logreg_train.py
+++ b/src/logreg_train.py
@@ -1,11 +1,55 @@
+#!/bin/python3
+
import sys
-from model import Model
+import pandas as pd
+import numpy as np
+
+from dataset import Dataset
+
+
+def sigmoid(x):
+    return 1.0 / (1.0 + np.exp(-x))
+
+def hypothesis(x, theta):
+ return sigmoid(x.dot(theta))
+def gradient(ys, xs, theta):
+ g = np.zeros(len(xs[0]))
+ for j in range(len(theta)):
+ g[j] = sum([(hypothesis(x, theta) - y) * x[j] for y, x in zip(ys, xs)]) / len(xs)
+ return g
+
+def gradient_descent(ys, xs, alpha, epoch):
+ theta = np.random.randn(len(xs[0]))
+ for i in range(epoch):
+ print("Gradient descent: {:02}%\r".format(int((i / epoch) * 100.0)), end="")
+ theta = theta - alpha * gradient(ys, xs, theta)
+ return theta
+
+def train(ys, xs):
+ thetas = []
+ # print(np.unique(ys))
+ for trained in np.unique(ys):
+        print(f"Training against {trained}")
+ ys_ally = ys.copy()
+ ys_ally[ys == trained] = 0 # opposite?
+ ys_ally[ys != trained] = 1
+ thetas.append((trained, gradient_descent(ys_ally, xs, 1, 2)))
+ return thetas
if __name__ == '__main__':
if len(sys.argv) != 2:
raise 'Usage: {} dataset_path'.format(sys.argv[0])
- m = Model()
- m.train()
- # write
+ d = Dataset(sys.argv[1])
+
+ X = d.df_scores.values
+ X = np.hstack([X, np.ones((X.shape[0], 1))])
+ X = (X - X.min()) / (X.max() - X.min())
+ Y = d.df["house"].values
+
+ thetas = train(Y, X)
+
+ with open("weights", "w") as f:
+ for name, t in thetas:
+ f.write("{}: {}\n".format(name, ','.join([str(x) for x in t])))
diff --git a/src/model.py b/src/model.py
deleted file mode 100644
index 47a660c..0000000
--- a/src/model.py
+++ /dev/null
@@ -1,69 +0,0 @@
-class Model:
- def __init__(self, weights_filename='weights'):
- self.weights_filename = weights_filename
-
- def train(self, xs, ys, alpha=1, epoch=1000):
- for _ in range(epoch):
- theta = theta - alpha * self.gradient(xs, ys)
-
- def train_against(self, xs, ys, theta, one, alpha, epoch):
- ys_ally = ys.copy()
- ys_ally[ys == one] = 0
- ys_ally[ys != one] = 1
- return gradient_descent(xs, ys_ally, theta, alpha, epoch)
-
- def train_thetas(xs, ys, theta, alpha=1, epoch=1000):
- thetas = []
- for i in np.unique(ys):
- thetas.append(train_against(xs, ys, theta, i, alpha, epoch))
- return thetas
-
- def gradient(self, xs, ys):
- return np.array([self.partial(xs, ys, i) for i in range(len(self.theta))])
-
- def partial(self, xs, ys, theta_j):
- total = 0
- for x_i, y_i in zip(xs, ys):
- temp = self.hypothesis(x_i) - y_i
- if theta_j != 0:
- temp *= x_i[theta_j - 1]
- total += temp
- return total / len(xs)
-
- def predict(self, x):
- return 1 if self.hypothesis(x) >= 0.5 else 0
-
- def hypothesis(self, x):
- return self._sigmoid(x.dot(self.theta))
-
- def logloss(self, x, y):
- if y == 1:
- return -np.ln(self.hypothesis(x))
- elif y == 0:
- return -np.ln(1 - self.hypothesis(x))
- else:
- raise "y != 1 and y != 0"
-
- def cost(self, xs, ys):
- return sum([self.logloss(x, y) for x, y in zip(xs, ys)]) / len(xs)
-
- def _sigmoid(self, x):
- return 1 / (1 + np.exp(-x))
-
- def _normalize(self, x):
- return (x - x.min()) / (x.max() - x.min())
-
- def _read_weights(self):
- try:
- with open(self.weights_filename, 'r') as file:
- self.weights = np.array(
- [float(s) for s in file.read().strip().split(',')])
- except IOError:
- raise 'Couldn\'t read weights file at: {}'.format(self.weights_filename)
-
- def _write_weights(self):
- try:
- with open(self.weights_filename, 'w') as file:
- file.write(','.join([str(w) for w in self.weights])
- except IOError:
- raise 'Couldn\'t write weights file at: {}'.format(self.weights_filename)
diff --git a/src/pair_plot.py b/src/pair_plot.py
index bf0c632..37821eb 100644..100755
--- a/src/pair_plot.py
+++ b/src/pair_plot.py
@@ -1,6 +1,17 @@
-from analysis import Analysis
+#!/bin/python3
+
+import sys
+
+import pandas as pd
+import matplotlib.pyplot as plt
+
+from dataset import Dataset
if __name__ == '__main__':
- a = Analysis('../datasets/dataset_train.csv')
- a.pair_plot()
+ if len(sys.argv) != 2:
+        raise SystemExit("Usage: {} dataset_path".format(sys.argv[0]))
+ d = Dataset(sys.argv[1])
+ pd.plotting.scatter_matrix(d.df_scores, s=2, alpha=0.8)
+ plt.show()
+
diff --git a/src/scatter_plot.py b/src/scatter_plot.py
index 74e0384..cbb38b9 100644..100755
--- a/src/scatter_plot.py
+++ b/src/scatter_plot.py
@@ -1,6 +1,17 @@
-from analysis import Analysis
+#!/bin/python3
+
+import sys
+
+import matplotlib.pyplot as plt
+
+from dataset import Dataset
if __name__ == '__main__':
- a = Analysis('../datasets/dataset_train.csv')
- a.scatter()
+ if len(sys.argv) != 2:
+        raise SystemExit("Usage: {} dataset_path".format(sys.argv[0]))
+ d = Dataset(sys.argv[1])
+ plt.scatter(d.df['astronomy'], d.df['defense_dark_arts'], s=5)
+ plt.xlabel('astronomy')
+ plt.ylabel('defense_dark_arts')
+ plt.show()