aboutsummaryrefslogtreecommitdiff
path: root/src/describe.py
blob: 3e54c644b20635a71f489c13ca8151e20456febd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#!/bin/python3

import sys

import pandas as pd
import numpy as np

from dataset import Dataset
import dslr_stat


def main(argv):
    """Print a table of summary statistics for the float64 columns of a dataset.

    Args:
        argv: Command-line argument list in ``sys.argv`` form, i.e.
            ``[program_name, dataset_path]``.

    Raises:
        SystemExit: With a usage message when ``argv`` does not hold exactly
            one argument after the program name.
    """
    if len(argv) != 2:
        # Raising a bare string is a TypeError in Python 3 ("exceptions must
        # derive from BaseException"), which hid the usage text. sys.exit()
        # with a string prints it to stderr and exits with a non-zero status.
        prog = argv[0] if argv else "describe.py"
        sys.exit("Usage: {} dataset_path".format(prog))

    d = Dataset(argv[1])

    # Row label -> function applied to each column; order defines the row
    # order of the printed table.
    # NOTE(review): len() counts every entry of the column, including missing
    # values if Dataset keeps them -- confirm this is the intended 'Count'.
    stats = [
        ('Count', len),
        ('Mean', dslr_stat.mean),
        ('Std', dslr_stat.std),
        ('Min', dslr_stat.min),
        ('25%', dslr_stat.q25),
        ('50%', dslr_stat.median),
        ('75%', dslr_stat.q75),
        ('Max', dslr_stat.max),
    ]

    # Only columns whose dtype is float64 are described.
    desc_df = pd.DataFrame(
        dtype=np.float64,
        columns=[c for c, t in zip(d.df.columns, d.df.dtypes) if t == np.float64],
        index=[label for label, _ in stats]
    )
    for col in desc_df.columns:
        series = d.df[col]
        for label, func in stats:
            desc_df.loc[label, col] = func(series)
    print(desc_df)


if __name__ == "__main__":
    main(sys.argv)