forked from apache/superset
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataframe.py
112 lines (88 loc) · 3.1 KB
/
dataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
""" Superset wrapper around pandas.DataFrame.
TODO(bkyryliuk): add support for the conventions like: *_dim or dim_*
dimensions, *_ts, ts_*, ds_*, *_ds - datetime, etc.
TODO(bkyryliuk): recognize integer encoded enums.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import pandas as pd
import numpy as np
# Percentage (0-100) that the datetime conversion rate of a sampled 'object'
# column must exceed for the column to be reported as 'datetime_string'.
INFER_COL_TYPES_THRESHOLD = 95
# Upper bound on the number of rows sampled when inferring column types.
INFER_COL_TYPES_SAMPLE_SIZE = 100
class SupersetDataFrame(object):
    """Wrapper around a ``pandas.DataFrame`` exposing rows and column metadata.

    The wrapped frame is stored with ``None`` substituted wherever
    ``pd.notnull`` reports a missing value.
    """

    def __init__(self, df):
        """Store *df* with missing values replaced by ``None``.

        :param df: the ``pandas.DataFrame`` to wrap.
        """
        self.__df = df.where((pd.notnull(df)), None)

    @property
    def size(self):
        """Number of rows in the wrapped frame."""
        return len(self.__df.index)

    @property
    def data(self):
        """Rows of the wrapped frame as a list of ``{column: value}`` dicts."""
        return self.__df.to_dict(orient='records')

    @property
    def columns_dict(self):
        """Provides metadata about columns for data visualization.

        :return: ``None`` when the frame is empty; otherwise a list with one
            dict per column holding the fields name, type, is_date, is_dim
            and, only when an aggregate applies, agg.
        """
        if self.__df.empty:
            return None

        # Type inference on 'object' columns runs on a random sample of rows
        # rather than on the whole frame.
        sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.__df.index))
        sample = self.__df
        if sample_size:
            sample = self.__df.sample(sample_size)

        dtypes = self.__df.dtypes
        columns = []
        for col in dtypes.keys():
            dtype = dtypes[col]
            column = {
                'name': col,
                'type': dtype.name,
                'is_date': is_date(dtype),
                'is_dim': is_dimension(dtype, col),
            }

            # 'agg' is an optional attribute: only set it when there is one.
            # (Test the computed value, not the agg_func function itself.)
            agg = agg_func(dtype, col)
            if agg:
                column['agg'] = agg

            if column['type'] == 'object':
                # check if encoded datetime
                if (datetime_conversion_rate(sample[col]) >
                        INFER_COL_TYPES_THRESHOLD):
                    column.update({
                        'type': 'datetime_string',
                        'is_date': True,
                        'is_dim': False,
                    })
                    # datetime strings carry no aggregate
                    column.pop('agg', None)

            columns.append(column)
        return columns
def datetime_conversion_rate(data_series):
    """Return the percentage of values that ``pd.to_datetime`` accepts.

    It will give false positives on numbers that are stored as strings,
    because it is hard to distinguish integer numbers from timestamps.

    :param data_series: iterable of values to probe.
    :return: a number between 0 and 100; 0 when *data_series* is empty.
    """
    success = 0
    total = 0
    for value in data_series:
        total += 1
        try:
            pd.to_datetime(value)
        except Exception:
            # Any failure to parse just means "not a datetime"; keep counting.
            continue
        success += 1

    # An empty series has no convertible values; avoid dividing by zero.
    if not total:
        return 0
    return 100 * success / total
def is_date(dtype):
    """Tell whether *dtype* is a datetime dtype, judged by its name.

    :param dtype: object with a ``name`` attribute (e.g. a numpy dtype).
    :return: True when the name starts with 'datetime', False otherwise,
        including when the name is empty or missing.
    """
    name = dtype.name
    # Always hand back a real bool instead of falling through to None.
    return bool(name) and name.startswith('datetime')
def is_dimension(dtype, column_name):
    """Tell whether a column should be treated as a dimension.

    :param dtype: the column dtype; only its ``name`` is inspected.
    :param column_name: the column name, checked with ``is_id``.
    :return: True for 'object' and 'bool' columns that are not ids.
    """
    dimension_types = ('object', 'bool')
    return not is_id(column_name) and dtype.name in dimension_types
def is_id(column_name):
    """Tell whether *column_name* begins or ends with 'id'.

    :param column_name: the column name as a string.
    :return: True when the name starts or ends with 'id', else False.
    """
    marker = 'id'
    if column_name.startswith(marker):
        return True
    return column_name.endswith(marker)
def agg_func(dtype, column_name):
    """Pick the default aggregate for a column.

    :param dtype: the column dtype, tested against ``np.number``.
    :param column_name: the column name, checked with ``is_id``.
    :return: 'count_distinct' for id columns, 'sum' for other numeric
        columns, and None when neither applies.
    """
    # consider checking for key substring too.
    if is_id(column_name):
        aggregate = 'count_distinct'
    elif np.issubdtype(dtype, np.number):
        aggregate = 'sum'
    else:
        aggregate = None
    return aggregate