diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/analysis.py | 49 | ||||
| -rw-r--r-- | src/cli.py | 29 | ||||
| -rwxr-xr-x[-rw-r--r--] | src/dataset.py | 10 | ||||
| -rwxr-xr-x[-rw-r--r--] | src/describe.py | 27 | ||||
| -rwxr-xr-x | src/histogram.py | 28 | ||||
| -rwxr-xr-x[-rw-r--r--] | src/logreg_predct.py | 11 | ||||
| -rwxr-xr-x[-rw-r--r--] | src/logreg_train.py | 52 | ||||
| -rw-r--r-- | src/model.py | 69 | ||||
| -rwxr-xr-x[-rw-r--r--] | src/pair_plot.py | 17 | ||||
| -rwxr-xr-x[-rw-r--r--] | src/scatter_plot.py | 17 |
10 files changed, 141 insertions, 168 deletions
diff --git a/src/analysis.py b/src/analysis.py deleted file mode 100644 index b6c9eb9..0000000 --- a/src/analysis.py +++ /dev/null @@ -1,49 +0,0 @@ -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - -from dataset import Dataset -import dslr_stat - - -class Analysis(Dataset): - def __init__(self, path): - super().__init__(path) - - def describe(self): - desc_df = pd.DataFrame( - dtype=np.float64, - columns=[c for c, t in zip(self.df.columns, self.df.dtypes) if t == np.float64], - index=['Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max'] - ) - for col in desc_df.columns: - desc_df.loc['Count', col] = len(self.df[col]) - desc_df.loc['Mean', col] = dslr_stat.mean(self.df[col]) - desc_df.loc['Std', col] = dslr_stat.std(self.df[col]) - desc_df.loc['Min', col] = dslr_stat.min(self.df[col]) - desc_df.loc['25%', col] = dslr_stat.q25(self.df[col]) - desc_df.loc['50%', col] = dslr_stat.median(self.df[col]) - desc_df.loc['75%', col] = dslr_stat.q75(self.df[col]) - desc_df.loc['Max', col] = dslr_stat.max(self.df[col]) - print(desc_df) - - def hist(self): - pass - - def scatter(self): - plt.scatter(self.df['astronomy'], self.df['defense_against_the_dark_arts']) - plt.show() - - def pair_plot(self): - scores = self.df_scores - fig, axis = plt.subplots(nrows=scores.shape[1], - ncols=scores.shape[1]) - for i, col in enumerate(scores.columns): - for j, pair_col in enumerate(scores.columns): - ax = axis[i, j] - if pair_col == col: - ax.hist(scores) - continue - ax.scatter(scores[col], scores[pair_col]) - plt.tight_layout() - plt.show() diff --git a/src/cli.py b/src/cli.py deleted file mode 100644 index ec1a324..0000000 --- a/src/cli.py +++ /dev/null @@ -1,29 +0,0 @@ -class CommandLineInterface: - def __init__(self): - pass - - def parse_args(self): - parse = argparse.ArgumentParser(prog="dslr_cli", - description="CLI for the dslr project") - subparser = parser.add_subparsers(dest="subparser_name") - parser_describe = subparsers.add_parser("describe", - help="give useful information about a dataset") - parser_describe.add_argument("path", help="path to the dataset") - parser_describe.set_defaults(func=self._describe) - - self.args = parser.parse_args(sys.argv[1:]) - - def exec_args(self): - if self.args.subparser_name is None: - print("{} --help for more information".format(sys.argv[0])) - return - self.args.func() - - def _describe(self): - describe.describe(self.args.path) - - -if __name__ == "__main__": - cli = CommandLineInterface() - cli.parse_args() - cli.exec_args() diff --git a/src/dataset.py b/src/dataset.py index 650d334..e9f4b44 100644..100755 --- a/src/dataset.py +++ b/src/dataset.py @@ -1,3 +1,7 @@ +#!/bin/python3 + +import sys + import pandas as pd @@ -9,13 +13,19 @@ class Dataset: except FileNotFoundError: raise "Couldn't find dataset at: {}".format(path) self.df.drop(columns=['Index'], inplace=True) + self.df.dropna(axis=1, how="all", inplace=True) self.df.dropna(inplace=True) self.df.columns = self.df.columns.str.lower() self.df.columns = self.df.columns.str.replace(' ', '_') self.df.rename(columns={'hogwarts_house': 'house'}, inplace=True) + self.df.rename(columns={'care_of_magical_creatures': 'magical_creatures'}, inplace=True) + self.df.rename(columns={'defense_against_the_dark_arts': 'defense_dark_arts'}, inplace=True) @property def df_scores(self): return self.df.loc[:, 'arithmancy':'flying'] +if __name__ == "__main__": + d = Dataset(sys.argv[1]) + print(d.df) diff --git a/src/describe.py b/src/describe.py index 4a3c5bc..3e54c64 100644..100755 --- a/src/describe.py +++ b/src/describe.py @@ -1,11 +1,30 @@ +#!/bin/python3 + import sys -from analysis import Analysis +import pandas as pd +import numpy as np + +from dataset import Dataset +import dslr_stat if __name__ == "__main__": if len(sys.argv) != 2: raise "Usage: {} dataset_path".format(sys.argv[0]) - a = Analysis(sys.argv[1]) - a.describe() - print(a.df_scores.describe()) + d = Dataset(sys.argv[1]) + desc_df = pd.DataFrame( + dtype=np.float64, + columns=[c for c, t in zip(d.df.columns, d.df.dtypes) if t == np.float64], + index=['Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max'] + ) + for col in desc_df.columns: + desc_df.loc['Count', col] = len(d.df[col]) + desc_df.loc['Mean', col] = dslr_stat.mean(d.df[col]) + desc_df.loc['Std', col] = dslr_stat.std(d.df[col]) + desc_df.loc['Min', col] = dslr_stat.min(d.df[col]) + desc_df.loc['25%', col] = dslr_stat.q25(d.df[col]) + desc_df.loc['50%', col] = dslr_stat.median(d.df[col]) + desc_df.loc['75%', col] = dslr_stat.q75(d.df[col]) + desc_df.loc['Max', col] = dslr_stat.max(d.df[col]) + print(desc_df) diff --git a/src/histogram.py b/src/histogram.py new file mode 100755 index 0000000..1200233 --- /dev/null +++ b/src/histogram.py @@ -0,0 +1,28 @@ +#!/bin/python3 + +import sys + +import matplotlib.pyplot as plt + +from dataset import Dataset + + +def house_hist(ax, d, house_name): + h = d.df[d.df["house"] == house_name] + scores = h.loc[:, "arithmancy":"flying"] + x = (scores - scores.min()) / (scores.max() - scores.min()) + ax.hist(x.values.flatten(), bins=40, rwidth=0.8) + ax.set_title(house_name) + +if __name__ == "__main__": + if len(sys.argv) != 2: + raise "Usage: {} dataset_path".format(sys.argv[0]) + d = Dataset(sys.argv[1]) + + fig, axs = plt.subplots(2, 2, sharey=True, tight_layout=True) + house_hist(axs[0][0], d, "Gryffindor") + house_hist(axs[0][1], d, "Slytherin") + house_hist(axs[1][0], d, "Ravenclaw") + house_hist(axs[1][1], d, "Hufflepuff") + plt.show() + diff --git a/src/logreg_predct.py b/src/logreg_predct.py index ae416b0..0f09c31 100644..100755 --- a/src/logreg_predct.py +++ b/src/logreg_predct.py @@ -1,11 +1,8 @@ -import sys - -from model import Model +#!/bin/python3 +import sys if __name__ == '__main__': - if len(sys.argv) != 3: - raise 'Usage: {} dataset_path weights_path'.path(*sys.argv[1:]) - m = Model() - m.predict() + if len(sys.argv) != 2: + raise 'Usage: {} dataset_path'.path(sys.argv[0]) diff --git a/src/logreg_train.py b/src/logreg_train.py index 8bc9a25..e02e101 100644..100755 --- a/src/logreg_train.py +++ b/src/logreg_train.py @@ -1,11 +1,55 @@ +#!/bin/python3 + import sys -from model import Model +import pandas as pd +import numpy as np + +from dataset import Dataset + + +def sigmoid(x): + return 1.0 / (1.0 * np.exp(-x)) + +def hypothesis(x, theta): + return sigmoid(x.dot(theta)) +def gradient(ys, xs, theta): + g = np.zeros(len(xs[0])) + for j in range(len(theta)): + g[j] = sum([(hypothesis(x, theta) - y) * x[j] for y, x in zip(ys, xs)]) / len(xs) + return g + +def gradient_descent(ys, xs, alpha, epoch): + theta = np.random.randn(len(xs[0])) + for i in range(epoch): + print("Gradient descent: {:02}%\r".format(int((i / epoch) * 100.0)), end="") + theta = theta - alpha * gradient(ys, xs, theta) + return theta + +def train(ys, xs): + thetas = [] + # print(np.unique(ys)) + for trained in np.unique(ys): + print(f"Trainning against {trained}") + ys_ally = ys.copy() + ys_ally[ys == trained] = 0 # opposite? + ys_ally[ys != trained] = 1 + thetas.append((trained, gradient_descent(ys_ally, xs, 1, 2))) + return thetas if __name__ == '__main__': if len(sys.argv) != 2: raise 'Usage: {} dataset_path'.format(sys.argv[0]) - m = Model() - m.train() - # write + d = Dataset(sys.argv[1]) + + X = d.df_scores.values + X = np.hstack([X, np.ones((X.shape[0], 1))]) + X = (X - X.min()) / (X.max() - X.min()) + Y = d.df["house"].values + + thetas = train(Y, X) + + with open("weights", "w") as f: + for name, t in thetas: + f.write("{}: {}\n".format(name, ','.join([str(x) for x in t]))) diff --git a/src/model.py b/src/model.py deleted file mode 100644 index 47a660c..0000000 --- a/src/model.py +++ /dev/null @@ -1,69 +0,0 @@ -class Model: - def __init__(self, weights_filename='weights'): - self.weights_filename = weights_filename - - def train(self, xs, ys, alpha=1, epoch=1000): - for _ in range(epoch): - theta = theta - alpha * self.gradient(xs, ys) - - def train_against(self, xs, ys, theta, one, alpha, epoch): - ys_ally = ys.copy() - ys_ally[ys == one] = 0 - ys_ally[ys != one] = 1 - return gradient_descent(xs, ys_ally, theta, alpha, epoch) - - def train_thetas(xs, ys, theta, alpha=1, epoch=1000): - thetas = [] - for i in np.unique(ys): - thetas.append(train_against(xs, ys, theta, i, alpha, epoch)) - return thetas - - def gradient(self, xs, ys): - return np.array([self.partial(xs, ys, i) for i in range(len(self.theta))]) - - def partial(self, xs, ys, theta_j): - total = 0 - for x_i, y_i in zip(xs, ys): - temp = self.hypothesis(x_i) - y_i - if theta_j != 0: - temp *= x_i[theta_j - 1] - total += temp - return total / len(xs) - - def predict(self, x): - return 1 if self.hypothesis(x) >= 0.5 else 0 - - def hypothesis(self, x): - return self._sigmoid(x.dot(self.theta)) - - def logloss(self, x, y): - if y == 1: - return -np.ln(self.hypothesis(x)) - elif y == 0: - return -np.ln(1 - self.hypothesis(x)) - else: - raise "y != 1 and y != 0" - - def cost(self, xs, ys): - return sum([self.logloss(x, y) for x, y in zip(xs, ys)]) / len(xs) - - def _sigmoid(self, x): - return 1 / (1 + np.exp(-x)) - - def _normalize(self, x): - return (x - x.min()) / (x.max() - x.min()) - - def _read_weights(self): - try: - with open(self.weights_filename, 'r') as file: - self.weights = np.array( - [float(s) for s in file.read().strip().split(',')]) - except IOError: - raise 'Couldn\'t read weights file at: {}'.format(self.weights_filename) - - def _write_weights(self): - try: - with open(self.weights_filename, 'w') as file: - file.write(','.join([str(w) for w in self.weights]) - except IOError: - raise 'Couldn\'t write weights file at: {}'.format(self.weights_filename) diff --git a/src/pair_plot.py b/src/pair_plot.py index bf0c632..37821eb 100644..100755 --- a/src/pair_plot.py +++ b/src/pair_plot.py @@ -1,6 +1,17 @@ -from analysis import Analysis +#!/bin/python3 + +import sys + +import pandas as pd +import matplotlib.pyplot as plt + +from dataset import Dataset if __name__ == '__main__': - a = Analysis('../datasets/dataset_train.csv') - a.pair_plot() + if len(sys.argv) != 2: + raise "Usage: {} dataset_path".format(sys.argv[0]) + d = Dataset(sys.argv[1]) + pd.plotting.scatter_matrix(d.df_scores, s=2, alpha=0.8) + plt.show() + diff --git a/src/scatter_plot.py b/src/scatter_plot.py index 74e0384..cbb38b9 100644..100755 --- a/src/scatter_plot.py +++ b/src/scatter_plot.py @@ -1,6 +1,17 @@ -from analysis import Analysis +#!/bin/python3 + +import sys + +import matplotlib.pyplot as plt + +from dataset import Dataset if __name__ == '__main__': - a = Analysis('../datasets/dataset_train.csv') - a.scatter() + if len(sys.argv) != 2: + raise "Usage: {} dataset_path".format(sys.argv[0]) + d = Dataset(sys.argv[1]) + plt.scatter(d.df['astronomy'], d.df['defense_dark_arts'], s=5) + plt.xlabel('astronomy') + plt.ylabel('defense_dark_arts') + plt.show() |
