From dea0f4cdec5bdf24962c8ab3ab2a6473e202259a Mon Sep 17 00:00:00 2001
From: Charles <sircharlesaze@gmail.com>
Date: Sat, 25 Jan 2020 13:06:10 +0100
Subject: Custom statistics module, describe program

---
 dslr_notebook.ipynb | 13 +++++++----
 src/analysis.py     | 30 ++++++++++++++++++++------
 src/describe.py     | 20 +++++++++++++++++
 src/dslr_stat.py    | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 114 insertions(+), 11 deletions(-)
 create mode 100644 src/dslr_stat.py

diff --git a/dslr_notebook.ipynb b/dslr_notebook.ipynb
index b226df7..929bb71 100644
--- a/dslr_notebook.ipynb
+++ b/dslr_notebook.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 34,
    "metadata": {
     "scrolled": false
    },
@@ -19,10 +19,10 @@
     {
      "data": {
       "text/plain": [
-       "array(['Ravenclaw', 'Slytherin', 'Gryffindor', 'Hufflepuff'], dtype=object)"
+       "1251"
       ]
      },
-     "execution_count": 1,
+     "execution_count": 34,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -35,7 +35,12 @@
     "df = pd.read_csv(\"./datasets/dataset_train.csv\")\n",
     "df.drop(columns=[\"Index\"], inplace=True)\n",
     "df.dropna(inplace=True)\n",
-    "df['Hogwarts House'].unique()"
+    "df.columns = df.columns.str.lower()\n",
+    "df.columns = df.columns.str.replace(' ', '_')\n",
+    "df.rename(columns={'hogwarts_house': 'house'}, inplace=True)\n",
+    "df.describe()\n",
+    "df['arithmancy'][df['arithmancy'] == 48793.000000]\n",
+    "len(df['arithmancy'])"
    ]
   },
   {
diff --git a/src/analysis.py b/src/analysis.py
index 64ba100..abc0ffb 100644
--- a/src/analysis.py
+++ b/src/analysis.py
@@ -1,10 +1,43 @@
-class Analysis(Dataset):
-    def __init__(self, path):
-        self.dataset_path = path
-        super().__init__(path)
+import numpy as np
+import pandas as pd
+
+import dslr_stat
 
-    def describe(self):
-        for title in self.df.
-        pass
 
+class Analysis:
+    """Summary statistics over the float64 columns of a DataFrame."""
+
+    def __init__(self, df):
+        """Store ``df``, the DataFrame whose columns will be summarised."""
+        self.df = df
 
+    def describe(self):
+        """Print a table of summary statistics for every float64 column.
+
+        Rows are Count, Mean, Std, Min, 25%, 50%, 75% and Max; every value
+        except Count is computed by the ``dslr_stat`` module.
+        """
+        stats = (
+            ('Count', len),
+            ('Mean', dslr_stat.mean),
+            ('Std', dslr_stat.std),
+            ('Min', dslr_stat.min),
+            ('25%', dslr_stat.q25),
+            ('50%', dslr_stat.median),
+            ('75%', dslr_stat.q75),
+            ('Max', dslr_stat.max),
+        )
+        float_cols = [
+            name for name, dtype in zip(self.df.columns, self.df.dtypes)
+            if dtype == np.float64
+        ]
+        desc_df = pd.DataFrame(
+            dtype=np.float64,
+            columns=float_cols,
+            index=[label for label, _ in stats],
+        )
+        for col in float_cols:
+            values = self.df[col]
+            for label, func in stats:
+                desc_df.loc[label, col] = func(values)
+        print(desc_df)
diff --git a/src/describe.py b/src/describe.py
index e69de29..7a968f1 100644
--- a/src/describe.py
+++ b/src/describe.py
@@ -0,0 +1,39 @@
+import sys
+
+import pandas as pd
+from analysis import Analysis
+
+
+def main(argv):
+    """Print summary statistics for a slice of the columns of a CSV dataset.
+
+    Returns the process exit status: 0 on success, 1 on a usage or input
+    error.
+    """
+    if len(argv) != 2:
+        print("Usage: {} dataset_path".format(argv[0]))
+        return 1
+    try:
+        df = pd.read_csv(argv[1])
+    except FileNotFoundError:
+        print("Could not find dataset at: {}".format(argv[1]))
+        return 1
+    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
+        print("Could not read dataset at: {} ({})".format(argv[1], err))
+        return 1
+    try:
+        # Label-based slice, inclusive of both 'Arithmancy' and 'Flying'.
+        df = df.loc[:, 'Arithmancy':'Flying']
+    except KeyError as err:
+        print("Dataset is missing an expected column: {}".format(err))
+        return 1
+    # Drop rows with any missing value before computing the statistics.
+    df = df.dropna()
+    Analysis(df).describe()
+    # pandas' own summary -- presumably kept as a reference to compare against.
+    print(df.describe())
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
diff --git a/src/dslr_stat.py b/src/dslr_stat.py
new file mode 100644
index 0000000..91ad744
--- /dev/null
+++ b/src/dslr_stat.py
@@ -0,0 +1,98 @@
+"""Small descriptive-statistics helpers: mean, std, min, max and quartiles."""
+
+import functools
+import math
+
+
+def _none_if_null_len(func):
+    """Decorate ``func`` so that it returns None when ``xs`` is empty."""
+    @functools.wraps(func)
+    def wrapper(xs, *args, **kwargs):
+        if len(xs) == 0:
+            return None
+        return func(xs, *args, **kwargs)
+    return wrapper
+
+
+@_none_if_null_len
+def mean(xs):
+    """Return the arithmetic mean of ``xs``, or None when ``xs`` is empty."""
+    return sum(xs) / len(xs)
+
+
+@_none_if_null_len
+def std(xs):
+    """Return the sample standard deviation of ``xs`` (n - 1 denominator).
+
+    Returns None when ``xs`` is empty and NaN when it holds a single value,
+    since the n - 1 denominator would then be zero.
+    """
+    if len(xs) < 2:
+        return math.nan
+    xs_mean = mean(xs)
+    return math.sqrt(
+        sum((x - xs_mean) ** 2 for x in xs) / (len(xs) - 1))
+
+
+@_none_if_null_len
+def _pick(xs, compar):
+    """Return the element of ``xs`` preferred by ``compar``.
+
+    ``compar(candidate, best)`` must be true when ``candidate`` should
+    replace ``best``.  Returns None when ``xs`` is empty.
+    """
+    # Iterate instead of using xs[0] / xs[1:]: on a pandas Series, xs[0] is
+    # a label lookup and fails once the row labelled 0 has been dropped.
+    values = iter(xs)
+    best = next(values)
+    for value in values:
+        if compar(value, best):
+            best = value
+    return best
+
+
+def min(xs):
+    """Return the smallest element of ``xs``, or None when ``xs`` is empty."""
+    return _pick(xs, lambda x, y: x < y)
+
+
+def max(xs):
+    """Return the largest element of ``xs``, or None when ``xs`` is empty."""
+    return _pick(xs, lambda x, y: x > y)
+
+
+def _qsort(xs):
+    """Return the elements of ``xs`` as a new list in ascending order."""
+    # Built-in sort: always yields a list and has no recursion-depth limit
+    # on long, already-sorted input.
+    return sorted(xs)
+
+
+def _need_sorted(func):
+    """Decorate ``func`` so that it receives ``xs`` as a sorted list."""
+    @functools.wraps(func)
+    def wrapper(xs, *args, **kwargs):
+        return func(_qsort(xs), *args, **kwargs)
+    return wrapper
+
+
+@_none_if_null_len
+@_need_sorted
+def q25(xs):
+    """Return the sorted element at index len // 4 (no interpolation)."""
+    return xs[len(xs) // 4]
+
+
+@_none_if_null_len
+@_need_sorted
+def median(xs):
+    """Return the sorted element at index len // 2 (no interpolation)."""
+    return xs[len(xs) // 2]
+
+
+@_none_if_null_len
+@_need_sorted
+def q75(xs):
+    """Return the sorted element at index 3 * len // 4 (no interpolation)."""
+    # Multiply before dividing: 3 * (n // 4) collapses to index 0 for n < 4.
+    return xs[(3 * len(xs)) // 4]
-- 
cgit