aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--dslr_notebook.ipynb13
-rw-r--r--src/analysis.py30
-rw-r--r--src/describe.py20
-rw-r--r--src/dslr_stat.py62
4 files changed, 114 insertions, 11 deletions
diff --git a/dslr_notebook.ipynb b/dslr_notebook.ipynb
index b226df7..929bb71 100644
--- a/dslr_notebook.ipynb
+++ b/dslr_notebook.ipynb
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 34,
"metadata": {
"scrolled": false
},
@@ -19,10 +19,10 @@
{
"data": {
"text/plain": [
- "array(['Ravenclaw', 'Slytherin', 'Gryffindor', 'Hufflepuff'], dtype=object)"
+ "1251"
]
},
- "execution_count": 1,
+ "execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
@@ -35,7 +35,12 @@
"df = pd.read_csv(\"./datasets/dataset_train.csv\")\n",
"df.drop(columns=[\"Index\"], inplace=True)\n",
"df.dropna(inplace=True)\n",
- "df['Hogwarts House'].unique()"
+ "df.columns = df.columns.str.lower()\n",
+ "df.columns = df.columns.str.replace(' ', '_')\n",
+ "df.rename(columns={'hogwarts_house': 'house'}, inplace=True)\n",
+ "df.describe()\n",
+ "df['arithmancy'][df['arithmancy'] == 48793.000000]\n",
+ "len(df['arithmancy'])"
]
},
{
diff --git a/src/analysis.py b/src/analysis.py
index 64ba100..abc0ffb 100644
--- a/src/analysis.py
+++ b/src/analysis.py
@@ -1,10 +1,26 @@
-class Analysis(Dataset):
- def __init__(self, path):
- self.dataset_path = path
- super().__init__(path)
+import numpy as np
+import pandas as pd
+
+import dslr_stat
- def describe(self):
- for title in self.df.
- pass
+class Analysis:
+ def __init__(self, df):
+ self.df = df
+ def describe(self):
+ desc_df = pd.DataFrame(
+ dtype=np.float64,
+ columns=[c for c, t in zip(self.df.columns, self.df.dtypes) if t == np.float64],
+ index=['Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max']
+ )
+ for col in desc_df.columns:
+ desc_df.loc['Count', col] = len(self.df[col])
+ desc_df.loc['Mean', col] = dslr_stat.mean(self.df[col])
+ desc_df.loc['Std', col] = dslr_stat.std(self.df[col])
+ desc_df.loc['Min', col] = dslr_stat.min(self.df[col])
+ desc_df.loc['25%', col] = dslr_stat.q25(self.df[col])
+ desc_df.loc['50%', col] = dslr_stat.median(self.df[col])
+ desc_df.loc['75%', col] = dslr_stat.q75(self.df[col])
+ desc_df.loc['Max', col] = dslr_stat.max(self.df[col])
+ print(desc_df)
diff --git a/src/describe.py b/src/describe.py
index e69de29..7a968f1 100644
--- a/src/describe.py
+++ b/src/describe.py
@@ -0,0 +1,20 @@
+import sys
+
+import pandas as pd
+from analysis import Analysis
+
+
+if __name__ == "__main__":
+ if len(sys.argv) != 2:
+ print("Usage: {} dataset_path".format(sys.argv[0]))
+ sys.exit(1)
+ try:
+ df = pd.read_csv(sys.argv[1])
+ except FileNotFoundError:
+ print("Could not find dataset at: {}".format(sys.argv[1]))
+ sys.exit(1)
+ df = df.loc[:, 'Arithmancy':'Flying']
+ df.dropna(inplace=True)
+ a = Analysis(df)
+ a.describe()
+ print(df.describe())
diff --git a/src/dslr_stat.py b/src/dslr_stat.py
new file mode 100644
index 0000000..91ad744
--- /dev/null
+++ b/src/dslr_stat.py
@@ -0,0 +1,62 @@
+import math
+
+
+def _none_if_null_len(func):
+ def tmp(xs, *args, **kwargs):
+ if len(xs) == 0:
+ return None
+ return func(xs, *args, **kwargs)
+ return tmp
+
+
+@_none_if_null_len
+def mean(xs):
+ return sum(xs) / len(xs)
+
+@_none_if_null_len
+def std(xs):
+ xs_mean = mean(xs)
+ return math.sqrt(sum(
+ [(x - xs_mean) ** 2 for x in xs]) / (len(xs) - 1))
+
+@_none_if_null_len
+def _pick(xs, compar):
+ m = xs[0]
+ for t in xs[1:]:
+ if compar(t, m):
+ m = t
+ return m
+
+def min(xs):
+ return _pick(xs, lambda x, y: x < y)
+
+def max(xs):
+ return _pick(xs, lambda x, y: x > y)
+
+def _qsort(xs):
+ if len(xs) < 2:
+ return xs
+ xs = list(xs)
+ pivot = xs[0]
+ body = xs[1:]
+ return (_qsort([x for x in body if x < pivot])
+ + [pivot]
+ + _qsort([x for x in body if x >= pivot]))
+
+def _need_sorted(func):
+ return lambda xs, *args, **kwargs: func(_qsort(xs), *args, **kwargs)
+
+@_none_if_null_len
+@_need_sorted
+def q25(xs):
+ return xs[len(xs) // 4]
+
+@_none_if_null_len
+@_need_sorted
+def median(xs):
+ return xs[len(xs) // 2 ]
+
+@_none_if_null_len
+@_need_sorted
+def q75(xs):
+ return xs[3 * (len(xs) // 4)]