-
Notifications
You must be signed in to change notification settings - Fork 2
/
RandomDataMaker.py
207 lines (178 loc) · 6.25 KB
/
RandomDataMaker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
from Graph import *
import numpy as np
from itertools import product
import pandas as pd
from random import randint, uniform
import math
def my_random(bound):
    """
    Draw a random number from a symmetric range around zero whose size is
    controlled by 'bound'.

    If bound <= 1, a float is drawn from the uniform distribution over the
    interval [-bound, bound]. If bound > 1, an integer is drawn from the
    uniform integer distribution over
    {-ceil(bound), -ceil(bound) + 1, ..., ceil(bound)}.

    Parameters
    ----------
    bound: float
        must be strictly positive (checked with an assert).

    Returns
    -------
    int or float

    """
    assert bound > 0
    # Small bounds: continuous draw.
    if bound <= 1:
        return uniform(-bound, bound)
    # Large bounds: integer draw, with the limit rounded up to an integer.
    limit = math.ceil(bound)
    return randint(-limit, limit)
class RandomDataMaker:
    r"""
    The purpose of this class is to generate, for a linear SCM WITHOUT
    feedback loops, a synthetic dataset with: (1) column labels = the names
    of the nodes graph.ord_nodes, (2) node values in each row. To generate
    this, we require 'alpha_mat', 'graph', 'mean_eps' and 'sigma_eps'.

    \epsilon_j is a gaussian random variable representing the external root
    node pointing into x_j. Each node value is computed as

        x_i = \epsilon_i + \sum_{j < i} \alpha_{i|j} x_j

    where the indices follow the order of 'graph.ord_nodes'. Only the
    strictly lower triangular entries of 'alpha_mat' are used.

    (The docstrings of this class are raw strings so that '\alpha' and
    '\epsilon' are kept literally instead of being read as escape
    sequences.)

    Attributes
    ----------
    alpha_mat: np.array of shape=(dim,dim), where dim = number of nodes.
        The matrix of alphas (i.e., gains \alpha_{i|j})
    graph: Graph
        information about DAG structure
    mean_eps: list[float]
        list of the mean values of the gaussian random variables
        \epsilon_j. The entries in this list are ordered according to
        'graph.ord_nodes'
    sigma_eps: list[float]
        list of the standard deviations of the gaussian random variables
        \epsilon_j. The entries in this list are ordered according to
        'graph.ord_nodes'

    """

    def __init__(self, graph, mean_eps, sig_eps, alpha_mat=None,
                 alpha_bound=1):
        """
        Constructor.

        For this constructor, an alpha_mat not equal to 'None' can be
        submitted, or, if alpha_mat is None, an alpha_mat will be generated
        randomly using my_random() to generate each entry.

        Parameters
        ----------
        graph: Graph
        mean_eps: list[float]
            must have one entry per node.
        sig_eps: list[float]
            must have one entry per node. Stored as 'self.sigma_eps'.
        alpha_mat: np.array of shape=(dim, dim)
        alpha_bound: float
            must be a positive number. Only used when alpha_mat is None.

        Raises
        ------
        AssertionError
            if mean_eps or sig_eps does not have dim entries, or if a
            submitted alpha_mat does not have shape (dim, dim).

        """
        self.graph = graph
        dim = graph.num_nds
        assert len(mean_eps) == dim
        self.mean_eps = mean_eps
        assert len(sig_eps) == dim
        self.sigma_eps = sig_eps
        if alpha_mat is None:
            self.alpha_mat = RandomDataMaker.generate_random_alpha_mat(
                graph, alpha_bound)
        else:
            assert alpha_mat.shape == (dim, dim)
            self.alpha_mat = alpha_mat

    @staticmethod
    def generate_random_alpha_mat(graph, alpha_bound=1):
        r"""
        In this internal method, the gains \alpha_{i|j} are generated
        randomly using my_random() to generate each.

        An entry alpha_mat[row, col] is drawn only if row > col and the
        arrow (graph.ord_nodes[col], graph.ord_nodes[row]) is in
        graph.arrows. All other entries are left at zero.

        Parameters
        ----------
        graph: Graph
        alpha_bound: float
            must be a positive number.

        Returns
        -------
        np.array of shape=(dim, dim)

        """
        dim = graph.num_nds
        alpha_mat = np.zeros((dim, dim))
        # Visit the strictly lower triangle in row-major order, the same
        # order in which the full (row, col) product would reach it.
        for row in range(dim):
            row_nd = graph.ord_nodes[row]
            for col in range(row):
                col_nd = graph.ord_nodes[col]
                if (col_nd, row_nd) in graph.arrows:
                    alpha_mat[row, col] = my_random(alpha_bound)
        return alpha_mat

    def generate_one_random_instance(self):
        r"""
        This internal method returns a list with random values for the
        nodes 'graph.ord_nodes'.

        The value of node i is a gaussian sample with mean mean_eps[i] and
        standard deviation sigma_eps[i], plus alpha_mat[i, j] times the
        already computed value of node j, summed over all j < i.

        Returns
        -------
        list[float]
            list of length dim, ordered according to 'graph.ord_nodes'

        """
        dim = self.graph.num_nds
        nd_values = [0] * dim
        for i in range(dim):
            # External noise term \epsilon_i; its mean comes from mean_eps
            # rather than from a hard-coded constant.
            value = np.random.normal(loc=self.mean_eps[i],
                                     scale=self.sigma_eps[i])
            # Contributions of the nodes that precede node i.
            for j in range(i):
                value += self.alpha_mat[i, j] * nd_values[j]
            nd_values[i] = value
        return nd_values

    def write_dataset_csv(self, num_rows, path):
        """
        This method writes a file which contains a dataset in the
        comma-separated-values (csv) format. The dataset has (1) column
        labels = the names of the nodes graph.ord_nodes, (2) node values in
        each row.

        Parameters
        ----------
        num_rows: int
            number of rows of the dataset
        path: str
            path to the destination of the output file

        Returns
        -------
        None

        """
        # Build all rows first and create the DataFrame once; growing a
        # DataFrame one row at a time reallocates it on every row.
        rows = [self.generate_one_random_instance()
                for _ in range(num_rows)]
        df = pd.DataFrame(rows, columns=self.graph.ord_nodes)
        df.to_csv(path, index=False)
if __name__ == "__main__":
    def main(draw):
        """
        Demo: build a small DAG from a dot description, then write and
        print two datasets, one with randomly generated gains and one with
        a hand-specified alpha_mat.

        Parameters
        ----------
        draw: bool
            if True, graph.draw(jupyter=False) is called.

        Returns
        -------
        None

        """
        dot_path = 'tempo13.txt'
        # (an alternative dot file: 'dot_atlas/good_bad_trols_G1.dot')
        dot_text = ("digraph G {\n"
                    "a->b;\n"
                    "a->s;\n"
                    "n->s,a,b;\n"
                    "}")
        with open("tempo13.txt", "w") as dot_file:
            dot_file.write(dot_text)

        graph = Graph(dot_path)
        if draw:
            graph.draw(jupyter=False)
        num_nodes = graph.num_nds
        csv_path = "test_data.csv"

        # First dataset: gains drawn at random with alpha_bound=10.
        random_maker = RandomDataMaker(graph,
                                       mean_eps=[0]*num_nodes,
                                       sig_eps=[10]*num_nodes,
                                       alpha_bound=10)
        random_maker.write_dataset_csv(100, csv_path)
        print("alpha_mat=\n", random_maker.alpha_mat)
        print(pd.read_csv(csv_path))

        print("------------------------------")

        # Second dataset: gains fixed by hand, zero standard deviations.
        fixed_alpha = np.zeros((num_nodes, num_nodes))
        fixed_alpha[1, 0] = 4
        fixed_alpha[2, :2] = (2, -3)
        fixed_alpha[3, :2] = (1, -1)
        fixed_maker = RandomDataMaker(graph,
                                      mean_eps=[0]*num_nodes,
                                      sig_eps=[0.0]*num_nodes,
                                      alpha_mat=fixed_alpha)
        fixed_maker.write_dataset_csv(5, csv_path)
        print(pd.read_csv(csv_path))

    main(True)