diff options
| author | Charles <sircharlesaze@gmail.com> | 2020-01-25 13:06:10 +0100 |
|---|---|---|
| committer | Charles <sircharlesaze@gmail.com> | 2020-01-25 13:06:10 +0100 |
| commit | dea0f4cdec5bdf24962c8ab3ab2a6473e202259a (patch) | |
| tree | a2a703a50b2d744e103a657d50ea793743ce1ff5 /src | |
| parent | d5e51613d3582e18e858055cf4874507a0df452f (diff) | |
| download | dslr-dea0f4cdec5bdf24962c8ab3ab2a6473e202259a.tar.gz dslr-dea0f4cdec5bdf24962c8ab3ab2a6473e202259a.tar.bz2 dslr-dea0f4cdec5bdf24962c8ab3ab2a6473e202259a.zip | |
Custom statistics module, describe program
Diffstat (limited to 'src')
| -rw-r--r-- | src/analysis.py | 30 | ||||
| -rw-r--r-- | src/describe.py | 20 | ||||
| -rw-r--r-- | src/dslr_stat.py | 62 |
3 files changed, 105 insertions, 7 deletions
diff --git a/src/analysis.py b/src/analysis.py index 64ba100..abc0ffb 100644 --- a/src/analysis.py +++ b/src/analysis.py @@ -1,10 +1,26 @@ -class Analysis(Dataset): - def __init__(self, path): - self.dataset_path = path - super().__init__(path) +import numpy as np +import pandas as pd + +import dslr_stat - def describe(self): - for title in self.df. - pass +class Analysis: + def __init__(self, df): + self.df = df + def describe(self): + desc_df = pd.DataFrame( + dtype=np.float64, + columns=[c for c, t in zip(self.df.columns, self.df.dtypes) if t == np.float64], + index=['Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max'] + ) + for col in desc_df.columns: + desc_df.loc['Count', col] = len(self.df[col]) + desc_df.loc['Mean', col] = dslr_stat.mean(self.df[col]) + desc_df.loc['Std', col] = dslr_stat.std(self.df[col]) + desc_df.loc['Min', col] = dslr_stat.min(self.df[col]) + desc_df.loc['25%', col] = dslr_stat.q25(self.df[col]) + desc_df.loc['50%', col] = dslr_stat.median(self.df[col]) + desc_df.loc['75%', col] = dslr_stat.q75(self.df[col]) + desc_df.loc['Max', col] = dslr_stat.max(self.df[col]) + print(desc_df) diff --git a/src/describe.py b/src/describe.py index e69de29..7a968f1 100644 --- a/src/describe.py +++ b/src/describe.py @@ -0,0 +1,20 @@ +import sys + +import pandas as pd +from analysis import Analysis + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: {} dataset_path".format(sys.argv[0])) + sys.exit(1) + try: + df = pd.read_csv(sys.argv[1]) + except FileNotFoundError: + print("Could not find dataset at: {}".format(sys.argv[1])) + sys.exit(1) + df = df.loc[:, 'Arithmancy':'Flying'] + df.dropna(inplace=True) + a = Analysis(df) + a.describe() + print(df.describe()) diff --git a/src/dslr_stat.py b/src/dslr_stat.py new file mode 100644 index 0000000..91ad744 --- /dev/null +++ b/src/dslr_stat.py @@ -0,0 +1,62 @@ +import math + + +def _none_if_null_len(func): + def tmp(xs, *args, **kwargs): + if len(xs) == 0: + return None + return 
func(xs, *args, **kwargs) + return tmp + + +@_none_if_null_len +def mean(xs): + return sum(xs) / len(xs) + +@_none_if_null_len +def std(xs): + xs_mean = mean(xs) + return math.sqrt(sum( + [(x - xs_mean) ** 2 for x in xs]) / (len(xs) - 1)) + +@_none_if_null_len +def _pick(xs, compar): + m = xs[0] + for t in xs[1:]: + if compar(t, m): + m = t + return m + +def min(xs): + return _pick(xs, lambda x, y: x < y) + +def max(xs): + return _pick(xs, lambda x, y: x > y) + +def _qsort(xs): + if len(xs) < 2: + return xs + xs = list(xs) + pivot = xs[0] + body = xs[1:] + return (_qsort([x for x in body if x < pivot]) + + [pivot] + + _qsort([x for x in body if x >= pivot])) + +def _need_sorted(func): + return lambda xs, *args, **kwargs: func(_qsort(xs), *args, **kwargs) + +@_none_if_null_len +@_need_sorted +def q25(xs): + return xs[len(xs) // 4] + +@_none_if_null_len +@_need_sorted +def median(xs): + return xs[len(xs) // 2 ] + +@_none_if_null_len +@_need_sorted +def q75(xs): + return xs[3 * (len(xs) // 4)] |
