superset/superset/dataframe.py at master · shgy/superset

History

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

""" Superset wrapper around pandas.DataFrame.

TODO(bkyryliuk): add support for naming conventions such as *_dim or dim_*
for dimensions, and *_ts, ts_*, ds_*, *_ds for datetimes, etc.

TODO(bkyryliuk): recognize integer encoded enums.

"""

from __future__ import absolute_import

from __future__ import division

from __future__ import print_function

from __future__ import unicode_literals

import pandas as pd

import numpy as np

# An object column is reported as 'datetime_string' when more than this
# percentage (0-100) of its sampled values parse as datetimes.
INFER_COL_TYPES_THRESHOLD = 95

# Upper bound on the number of rows sampled when inferring column types.
INFER_COL_TYPES_SAMPLE_SIZE = 100

class SupersetDataFrame(object):
    """Thin wrapper around a pandas.DataFrame for data visualization.

    Exposes the row count, the rows as plain dicts, and per-column
    metadata (type, date/dimension flags and a suggested aggregation).
    """

    def __init__(self, df):
        """Store ``df`` after asking pandas to swap null cells for None.

        :param df: pandas.DataFrame to wrap
        """
        self.__df = df.where((pd.notnull(df)), None)

    @property
    def size(self):
        """Number of rows in the wrapped frame."""
        return len(self.__df.index)

    @property
    def data(self):
        """Rows of the wrapped frame as a list of column -> value dicts."""
        return self.__df.to_dict(orient='records')

    @property
    def columns_dict(self):
        """Provides metadata about columns for data visualization.

        :return: None when the frame is empty; otherwise a list holding
            one dict per column with the fields name, type, is_date,
            is_dim and, only when an aggregation applies, agg.
        """
        if self.__df.empty:
            return None

        columns = []
        # The frame is non-empty at this point, so at least one row is
        # sampled; the sample is only used for datetime-string detection.
        sample_size = min(INFER_COL_TYPES_SAMPLE_SIZE, len(self.__df.index))
        sample = self.__df.sample(sample_size)

        for col in self.__df.dtypes.keys():
            dtype = self.__df.dtypes[col]
            column = {
                'name': col,
                'type': dtype.name,
                'is_date': is_date(dtype),
                'is_dim': is_dimension(dtype, col),
            }

            # 'agg' is an optional attribute: only set it when an
            # aggregation was actually suggested.
            agg = agg_func(dtype, col)
            if agg:
                column['agg'] = agg

            if column['type'] == 'object':
                # check if encoded datetime
                if (datetime_conversion_rate(sample[col]) >
                        INFER_COL_TYPES_THRESHOLD):
                    column.update({
                        'type': 'datetime_string',
                        'is_date': True,
                        'is_dim': False,
                    })
                    # Datetime strings are never aggregated.
                    column.pop('agg', None)

            columns.append(column)

        return columns

# It will give false positives on the numbers that are stored as strings.
# It is hard to distinguish integer numbers and timestamps
def datetime_conversion_rate(data_series):
    """Return the percentage of values that pandas can convert to datetime.

    :param data_series: iterable of values to probe
    :return: number between 0 and 100; 0 when ``data_series`` is empty
    """
    success = 0
    total = 0
    for value in data_series:
        total += 1
        try:
            pd.to_datetime(value)
        except Exception:
            # Best effort: any conversion failure simply counts as a miss.
            continue
        success += 1

    if not total:
        # Nothing was probed; avoid dividing by zero.
        return 0
    return 100 * success / total

def is_date(dtype):
    """Tell whether ``dtype`` is a datetime dtype, judging by its name.

    :param dtype: object exposing a ``name`` attribute
    :return: True/False depending on whether the name starts with
        'datetime'; None when the name is empty
    """
    type_name = dtype.name
    if not type_name:
        return None
    return type_name.startswith('datetime')

def is_dimension(dtype, column_name):
    """Tell whether a column should be treated as a dimension.

    :param dtype: dtype of the column (its ``name`` is inspected)
    :param column_name: name of the column
    :return: False for id-like columns; otherwise True only for
        'object' and 'bool' dtypes
    """
    looks_like_id = is_id(column_name)
    return False if looks_like_id else dtype.name in ('object', 'bool')

def is_id(column_name):
    """Tell whether a column name looks like an identifier column.

    :param column_name: column label, normally a string
    :return: True when the name starts or ends with 'id'; False
        otherwise, including for labels that are not strings
    """
    try:
        return column_name.startswith('id') or column_name.endswith('id')
    except AttributeError:
        # Non-string labels (e.g. the integer labels pandas assigns by
        # default) cannot carry an 'id' prefix or suffix.
        return False

def agg_func(dtype, column_name):
    """Suggest a default aggregation for a column.

    :param dtype: dtype of the column
    :param column_name: name of the column
    :return: 'count_distinct' for id-like columns, 'sum' for numeric
        columns, None otherwise
    """
    # consider checking for key substring too.
    if is_id(column_name):
        return 'count_distinct'

    try:
        numeric = np.issubdtype(dtype, np.number)
    except TypeError:
        # numpy raises TypeError for dtypes it cannot interpret, such as
        # pandas extension dtypes; treat those as non-numeric.
        numeric = False

    return 'sum' if numeric else None

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

dataframe.py

dataframe.py

Files

dataframe.py

Latest commit

History

dataframe.py

File metadata and controls