aboutsummaryrefslogtreecommitdiff
path: root/src/describe.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/describe.py')
-rwxr-xr-x[-rw-r--r--]src/describe.py27
1 files changed, 23 insertions, 4 deletions
diff --git a/src/describe.py b/src/describe.py
index 4a3c5bc..3e54c64 100644..100755
--- a/src/describe.py
+++ b/src/describe.py
@@ -1,11 +1,30 @@
+#!/bin/python3
+
import sys
-from analysis import Analysis
+import pandas as pd
+import numpy as np
+
+from dataset import Dataset
+import dslr_stat
if __name__ == "__main__":
if len(sys.argv) != 2:
raise "Usage: {} dataset_path".format(sys.argv[0])
- a = Analysis(sys.argv[1])
- a.describe()
- print(a.df_scores.describe())
+ d = Dataset(sys.argv[1])
+ desc_df = pd.DataFrame(
+ dtype=np.float64,
+ columns=[c for c, t in zip(d.df.columns, d.df.dtypes) if t == np.float64],
+ index=['Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max']
+ )
+ for col in desc_df.columns:
+ desc_df.loc['Count', col] = len(d.df[col])
+ desc_df.loc['Mean', col] = dslr_stat.mean(d.df[col])
+ desc_df.loc['Std', col] = dslr_stat.std(d.df[col])
+ desc_df.loc['Min', col] = dslr_stat.min(d.df[col])
+ desc_df.loc['25%', col] = dslr_stat.q25(d.df[col])
+ desc_df.loc['50%', col] = dslr_stat.median(d.df[col])
+ desc_df.loc['75%', col] = dslr_stat.q75(d.df[col])
+ desc_df.loc['Max', col] = dslr_stat.max(d.df[col])
+ print(desc_df)