Custom statistics module, describe program

author: Charles <sircharlesaze@gmail.com> 2020-01-25 13:06:10 +0100
committer: Charles <sircharlesaze@gmail.com> 2020-01-25 13:06:10 +0100
commit: dea0f4cdec5bdf24962c8ab3ab2a6473e202259a (patch)
tree: a2a703a50b2d744e103a657d50ea793743ce1ff5 /src
parent: d5e51613d3582e18e858055cf4874507a0df452f (diff)
download: dslr-dea0f4cdec5bdf24962c8ab3ab2a6473e202259a.tar.gz
dslr-dea0f4cdec5bdf24962c8ab3ab2a6473e202259a.tar.bz2
dslr-dea0f4cdec5bdf24962c8ab3ab2a6473e202259a.zip
3 files changed, 105 insertions, 7 deletions
diff --git a/src/analysis.py b/src/analysis.py
index 64ba100..abc0ffb 100644
--- a/src/analysis.py
+++ b/src/analysis.py
@@ -1,10 +1,26 @@
-class Analysis(Dataset):
-    def __init__(self, path):
-        self.dataset_path = path
-        super().__init__(path)
+import numpy as np
+import pandas as pd
+
+import dslr_stat
 
-    def describe(self):
-        for title in self.df.
-        pass
 
+class Analysis:
+    def __init__(self, df):
+        self.df = df
 
+    def describe(self):
+        desc_df = pd.DataFrame(
+            dtype=np.float64,
+            columns=[c for c, t in zip(self.df.columns, self.df.dtypes) if t == np.float64],
+            index=['Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max']
+        )
+        for col in desc_df.columns:
+            desc_df.loc['Count', col] = len(self.df[col])
+            desc_df.loc['Mean', col] = dslr_stat.mean(self.df[col])
+            desc_df.loc['Std', col] = dslr_stat.std(self.df[col])
+            desc_df.loc['Min', col] = dslr_stat.min(self.df[col])
+            desc_df.loc['25%', col] = dslr_stat.q25(self.df[col])
+            desc_df.loc['50%', col] = dslr_stat.median(self.df[col])
+            desc_df.loc['75%', col] = dslr_stat.q75(self.df[col])
+            desc_df.loc['Max', col] = dslr_stat.max(self.df[col])
+        print(desc_df)
diff --git a/src/describe.py b/src/describe.py
index e69de29..7a968f1 100644
--- a/src/describe.py
+++ b/src/describe.py
@@ -0,0 +1,20 @@
+import sys
+
+import pandas as pd
+from analysis import Analysis
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: {} dataset_path".format(sys.argv[0]))
+        sys.exit(1)
+    try:
+        df = pd.read_csv(sys.argv[1])
+    except FileNotFoundError:
+        print("Could not find dataset at: {}".format(sys.argv[1]))
+        sys.exit(1)
+    df = df.loc[:, 'Arithmancy':'Flying']
+    df.dropna(inplace=True)
+    a = Analysis(df)
+    a.describe()
+    print(df.describe())
diff --git a/src/dslr_stat.py b/src/dslr_stat.py
new file mode 100644
index 0000000..91ad744
--- /dev/null
+++ b/src/dslr_stat.py
@@ -0,0 +1,62 @@
+import math
+
+
+def _none_if_null_len(func):
+    def tmp(xs, *args, **kwargs):
+        if len(xs) == 0:
+            return None
+        return func(xs, *args, **kwargs)
+    return tmp
+
+
+@_none_if_null_len
+def mean(xs):
+    return sum(xs) / len(xs)
+
+@_none_if_null_len
+def std(xs):
+    xs_mean = mean(xs)
+    return math.sqrt(sum(
+        [(x - xs_mean) ** 2 for x in xs]) / (len(xs) - 1))
+
+@_none_if_null_len
+def _pick(xs, compar):
+    m = xs[0]
+    for t in xs[1:]:
+        if compar(t, m):
+            m = t
+    return m
+
+def min(xs):
+    return _pick(xs, lambda x, y: x < y)
+
+def max(xs):
+    return _pick(xs, lambda x, y: x > y)
+
+def _qsort(xs):
+    if len(xs) < 2:
+        return xs
+    xs = list(xs)
+    pivot = xs[0]
+    body = xs[1:]
+    return (_qsort([x for x in body if x < pivot])
+            + [pivot]
+            + _qsort([x for x in body if x >= pivot]))
+
+def _need_sorted(func):
+    return lambda xs, *args, **kwargs: func(_qsort(xs), *args, **kwargs)
+
+@_none_if_null_len
+@_need_sorted
+def q25(xs):
+    return xs[len(xs) // 4]
+
+@_none_if_null_len
+@_need_sorted
+def median(xs):
+    return xs[len(xs) // 2 ]
+
+@_none_if_null_len
+@_need_sorted
+def q75(xs):
+    return xs[3 * (len(xs) // 4)]
author	Charles <sircharlesaze@gmail.com>	2020-01-25 13:06:10 +0100
committer	Charles <sircharlesaze@gmail.com>	2020-01-25 13:06:10 +0100
commit	dea0f4cdec5bdf24962c8ab3ab2a6473e202259a (patch)
tree	a2a703a50b2d744e103a657d50ea793743ce1ff5 /src
parent	d5e51613d3582e18e858055cf4874507a0df452f (diff)
download	dslr-dea0f4cdec5bdf24962c8ab3ab2a6473e202259a.tar.gz dslr-dea0f4cdec5bdf24962c8ab3ab2a6473e202259a.tar.bz2 dslr-dea0f4cdec5bdf24962c8ab3ab2a6473e202259a.zip