diff options
| author | Charles Cabergs <me@cacharle.xyz> | 2020-08-29 17:13:57 +0200 |
|---|---|---|
| committer | Charles Cabergs <me@cacharle.xyz> | 2020-08-29 17:13:57 +0200 |
| commit | a1c7fe1b6d95e2560e62c12453da287d36d4a714 (patch) | |
| tree | 72ce0f3a7dcf3627aba0b019665a23dcaec9f4f5 /src/describe.py | |
| parent | c766a4481526215057cac928d09d62319f290fe4 (diff) | |
| download | dslr-a1c7fe1b6d95e2560e62c12453da287d36d4a714.tar.gz dslr-a1c7fe1b6d95e2560e62c12453da287d36d4a714.tar.bz2 dslr-a1c7fe1b6d95e2560e62c12453da287d36d4a714.zip | |
Removed bloat, Added logreg_train
Diffstat (limited to 'src/describe.py')
| -rwxr-xr-x[-rw-r--r--] | src/describe.py | 27 |
1 files changed, 23 insertions, 4 deletions
```diff
diff --git a/src/describe.py b/src/describe.py
index 4a3c5bc..3e54c64 100644..100755
--- a/src/describe.py
+++ b/src/describe.py
@@ -1,11 +1,30 @@
+#!/bin/python3
+
 import sys
 
-from analysis import Analysis
+import pandas as pd
+import numpy as np
+
+from dataset import Dataset
+import dslr_stat
 
 
 if __name__ == "__main__":
     if len(sys.argv) != 2:
         raise "Usage: {} dataset_path".format(sys.argv[0])
-    a = Analysis(sys.argv[1])
-    a.describe()
-    print(a.df_scores.describe())
+    d = Dataset(sys.argv[1])
+    desc_df = pd.DataFrame(
+        dtype=np.float64,
+        columns=[c for c, t in zip(d.df.columns, d.df.dtypes) if t == np.float64],
+        index=['Count', 'Mean', 'Std', 'Min', '25%', '50%', '75%', 'Max']
+    )
+    for col in desc_df.columns:
+        desc_df.loc['Count', col] = len(d.df[col])
+        desc_df.loc['Mean', col] = dslr_stat.mean(d.df[col])
+        desc_df.loc['Std', col] = dslr_stat.std(d.df[col])
+        desc_df.loc['Min', col] = dslr_stat.min(d.df[col])
+        desc_df.loc['25%', col] = dslr_stat.q25(d.df[col])
+        desc_df.loc['50%', col] = dslr_stat.median(d.df[col])
+        desc_df.loc['75%', col] = dslr_stat.q75(d.df[col])
+        desc_df.loc['Max', col] = dslr_stat.max(d.df[col])
+    print(desc_df)
```
