From dea0f4cdec5bdf24962c8ab3ab2a6473e202259a Mon Sep 17 00:00:00 2001 From: Charles Date: Sat, 25 Jan 2020 13:06:10 +0100 Subject: Custom statistics modulde, describe program --- dslr_notebook.ipynb | 13 +++++++---- src/analysis.py | 30 ++++++++++++++++++++------ src/describe.py | 20 +++++++++++++++++ src/dslr_stat.py | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 114 insertions(+), 11 deletions(-) create mode 100644 src/dslr_stat.py diff --git a/dslr_notebook.ipynb b/dslr_notebook.ipynb index b226df7..929bb71 100644 --- a/dslr_notebook.ipynb +++ b/dslr_notebook.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 34, "metadata": { "scrolled": false }, @@ -19,10 +19,10 @@ { "data": { "text/plain": [ - "array(['Ravenclaw', 'Slytherin', 'Gryffindor', 'Hufflepuff'], dtype=object)" + "1251" ] }, - "execution_count": 1, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -35,7 +35,12 @@ "df = pd.read_csv(\"./datasets/dataset_train.csv\")\n", "df.drop(columns=[\"Index\"], inplace=True)\n", "df.dropna(inplace=True)\n", - "df['Hogwarts House'].unique()" + "df.columns = df.columns.str.lower()\n", + "df.columns = df.columns.str.replace(' ', '_')\n", + "df.rename(columns={'hogwarts_house': 'house'}, inplace=True)\n", + "df.describe()\n", + "df['arithmancy'][df['arithmancy'] == 48793.000000]\n", + "len(df['arithmancy'])" ] }, { diff --git a/src/analysis.py b/src/analysis.py index 64ba100..abc0ffb 100644 --- a/src/analysis.py +++ b/src/analysis.py @@ -1,10 +1,26 @@ -class Analysis(Dataset): - def __init__(self, path): - self.dataset_path = path - super().__init__(path) +import numpy as np +import pandas as pd + +import dslr_stat - def describe(self): - for title in self.df. - pass +class Analysis: + def __init__(self, df): + self.df = df + def describe(self): + desc_df = pd.DataFrame( + dtype=np.float64, + columns=[c for c, t in zip(self.df.columns, self.df.dtypes) if t == np.float64], + index=['Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max'] + ) + for col in desc_df.columns: + desc_df.loc['Count', col] = len(self.df[col]) + desc_df.loc['Mean', col] = dslr_stat.mean(self.df[col]) + desc_df.loc['Std', col] = dslr_stat.std(self.df[col]) + desc_df.loc['Min', col] = dslr_stat.min(self.df[col]) + desc_df.loc['25%', col] = dslr_stat.q25(self.df[col]) + desc_df.loc['50%', col] = dslr_stat.median(self.df[col]) + desc_df.loc['75%', col] = dslr_stat.q75(self.df[col]) + desc_df.loc['Max', col] = dslr_stat.max(self.df[col]) + print(desc_df) diff --git a/src/describe.py b/src/describe.py index e69de29..7a968f1 100644 --- a/src/describe.py +++ b/src/describe.py @@ -0,0 +1,20 @@ +import sys + +import pandas as pd +from analysis import Analysis + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: {} dataset_path".format(sys.argv[0])) + sys.exit(1) + try: + df = pd.read_csv(sys.argv[1]) + except FileNotFoundError: + print("Could not find dataset at: {}".format(sys.argv[1])) + sys.exit(1) + df = df.loc[:, 'Arithmancy':'Flying'] + df.dropna(inplace=True) + a = Analysis(df) + a.describe() + print(df.describe()) diff --git a/src/dslr_stat.py b/src/dslr_stat.py new file mode 100644 index 0000000..91ad744 --- /dev/null +++ b/src/dslr_stat.py @@ -0,0 +1,62 @@ +import math + + +def _none_if_null_len(func): + def tmp(xs, *args, **kwargs): + if len(xs) == 0: + return None + return func(xs, *args, **kwargs) + return tmp + + +@_none_if_null_len +def mean(xs): + return sum(xs) / len(xs) + +@_none_if_null_len +def std(xs): + xs_mean = mean(xs) + return math.sqrt(sum( + [(x - xs_mean) ** 2 for x in xs]) / (len(xs) - 1)) + +@_none_if_null_len +def _pick(xs, compar): + m = xs[0] + for t in xs[1:]: + if compar(t, m): + m = t + return m + +def min(xs): + return _pick(xs, lambda x, y: x < y) + +def max(xs): + return _pick(xs, lambda x, y: x > y) + +def _qsort(xs): + if len(xs) < 2: + return xs + xs = list(xs) + pivot = xs[0] + body = xs[1:] + return (_qsort([x for x in body if x < pivot]) + + [pivot] + + _qsort([x for x in body if x >= pivot])) + +def _need_sorted(func): + return lambda xs, *args, **kwargs: func(_qsort(xs), *args, **kwargs) + +@_none_if_null_len +@_need_sorted +def q25(xs): + return xs[len(xs) // 4] + +@_none_if_null_len +@_need_sorted +def median(xs): + return xs[len(xs) // 2 ] + +@_none_if_null_len +@_need_sorted +def q75(xs): + return xs[3 * (len(xs) // 4)] -- cgit