summaryrefslogtreecommitdiff
path: root/util/stats/dbinit.py
blob: 686f55c9811bf3ec34c25fff19612a48f090e536 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
import MySQLdb

class MyDB(object):
    def __init__(self, options):
        self.name = options.db
        self.host = options.host
        self.user = options.user
        self.passwd = options.passwd
        self.mydb = None
        self.cursor = None

    def admin(self):
        self.close()
        self.mydb = MySQLdb.connect(db='mysql', host=self.host, user=self.user,
                                    passwd=self.passwd)
        self.cursor = self.mydb.cursor()

    def connect(self):
        self.close()
        self.mydb = MySQLdb.connect(db=self.name, host=self.host,
                                    user=self.user, passwd=self.passwd)
        self.cursor = self.mydb.cursor()

    def close(self):
        if self.mydb is not None:
            self.mydb.close()
        self.cursor = None

    def query(self, sql):
        self.cursor.execute(sql)

    def drop(self):
        self.query('DROP DATABASE IF EXISTS %s' % self.name)

    def create(self):
        self.query('CREATE DATABASE %s' % self.name)

    def populate(self):
        #
        # Each run (or simulation) gets its own entry in the runs table to
        # group stats by where they were generated
        #
        # COLUMNS:
        #   'id' is a unique identifier for each run to be used in other
        #       tables.
        #   'name' is the user designated name for the data generated.  It is
        #       configured in the simulator.
        #   'user' identifies the user that generated the data for the given
        #       run.
        #   'project' another name to identify runs for a specific goal
        #   'date' is a timestamp for when the data was generated.  It can be
        #       used to easily expire data that was generated in the past.
        #   'expire' is a timestamp for when the data should be removed from
        #       the database so we don't have years worth of junk.
        #
        # INDEXES:
        #   'run' is indexed so you can find out details of a run if the run
        #       was retreived from the data table.
        #   'name' is indexed so that two all run names are forced to be unique
        #
        self.query('''
        CREATE TABLE runs(
            rn_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
            rn_name	VARCHAR(200)		NOT NULL,
            rn_sample	VARCHAR(32)		NOT NULL,
            rn_user	VARCHAR(32)		NOT NULL,
            rn_project	VARCHAR(100)            NOT NULL,
            rn_date	TIMESTAMP		NOT NULL,
            rn_expire	TIMESTAMP               NOT NULL,
            PRIMARY KEY (rn_id),
            UNIQUE (rn_name,rn_sample)
        ) TYPE=InnoDB''')

        #
        # We keep the bin names separate so that the data table doesn't get
        # huge since bin names are frequently repeated.
        #
        # COLUMNS:
        #   'id' is the unique bin identifer.
        #   'name' is the string name for the bin.
        #
        # INDEXES:
        #   'bin' is indexed to get the name of a bin when data is retrieved
        #       via the data table.
        #   'name' is indexed to get the bin id for a named bin when you want
        #       to search the data table based on a specific bin.
        #
        self.query('''
        CREATE TABLE bins(
            bn_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
            bn_name	VARCHAR(255)		NOT NULL,
            PRIMARY KEY(bn_id),
            UNIQUE (bn_name)
        ) TYPE=InnoDB''')

        #
        # The stat table gives us all of the data for a particular stat.
        #
        # COLUMNS:
        #   'stat' is a unique identifier for each stat to be used in other
        #       tables for references.
        #   'name' is simply the simulator derived name for a given
        #       statistic.
        #   'descr' is the description of the statistic and what it tells
        #       you.
        #   'type' defines what the stat tells you.  Types are:
        #       SCALAR: A simple scalar statistic that holds one value
        #       VECTOR: An array of statistic values.  Such a something that
        #           is generated per-thread.  Vectors exist to give averages,
        #	     pdfs, cdfs, means, standard deviations, etc across the
        #           stat values.
        #       DIST: Is a distribution of data.  When the statistic value is
        #	     sampled, its value is counted in a particular bucket.
        #           Useful for keeping track of utilization of a resource.
        #           (e.g. fraction of time it is 25% used vs. 50% vs. 100%)
        #       VECTORDIST: Can be used when the distribution needs to be
        #	     factored out into a per-thread distribution of data for
        #	     example.  It can still be summed across threads to find
        #           the total distribution.
        #       VECTOR2D: Can be used when you have a stat that is not only
        #           per-thread, but it is per-something else.  Like
        #           per-message type.
        #       FORMULA: This statistic is a formula, and its data must be
        #	     looked up in the formula table, for indicating how to
        #           present its values.
        #   'subdata' is potentially used by any of the vector types to
        #       give a specific name to all of the data elements within a
        #       stat.
        #   'print' indicates whether this stat should be printed ever.
        #       (Unnamed stats don't usually get printed)
        #   'prereq' only print the stat if the prereq is not zero.
        #   'prec' number of decimal places to print
        #   'nozero' don't print zero values
        #   'nonan' don't print NaN values
        #   'total' for vector type stats, print the total.
        #   'pdf' for vector type stats, print the pdf.
        #   'cdf' for vector type stats, print the cdf.
        #
        #   The Following are for dist type stats:
        #   'min' is the minimum bucket value. Anything less is an underflow.
        #   'max' is the maximum bucket value. Anything more is an overflow.
        #   'bktsize' is the approximate number of entries in each bucket.
        #   'size' is the number of buckets. equal to (min/max)/bktsize.
        #
        # INDEXES:
        #   'stat' is indexed so that you can find out details about a stat
        #       if the stat id was retrieved from the data table.
        #   'name' is indexed so that you can simply look up data about a
        #       named stat.
        #
        self.query('''
        CREATE TABLE stats(
            st_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
            st_name	VARCHAR(255)		NOT NULL,
            st_descr	TEXT			NOT NULL,
            st_type	ENUM("SCALAR", "VECTOR", "DIST", "VECTORDIST",
                "VECTOR2D", "FORMULA")	NOT NULL,
            st_print	BOOL			NOT NULL,
            st_prereq	SMALLINT UNSIGNED	NOT NULL,
            st_prec	TINYINT			NOT NULL,
            st_nozero	BOOL			NOT NULL,
            st_nonan	BOOL			NOT NULL,
            st_total	BOOL			NOT NULL,
            st_pdf	BOOL			NOT NULL,
            st_cdf	BOOL			NOT NULL,
            st_min	DOUBLE			NOT NULL,
            st_max	DOUBLE			NOT NULL,
            st_bktsize	DOUBLE			NOT NULL,
            st_size	SMALLINT UNSIGNED	NOT NULL,
            PRIMARY KEY (st_id),
            UNIQUE (st_name)
        ) TYPE=InnoDB''')

        #
        # This is the main table of data for stats.
        #
        # COLUMNS:
        #   'stat' refers to the stat field given in the stat table.
        #
        #   'x' referrs to the first dimension of a multi-dimensional stat. For
        #       a vector, x will start at 0 and increase for each vector
        #       element.
        #       For a distribution:
        #       -1: sum (for calculating standard deviation)
        #       -2: sum of squares (for calculating standard deviation)
        #       -3: total number of samples taken (for calculating
        #           standard deviation)
        #       -4: minimum value
        #       -5: maximum value
        #       -6: underflow
        #       -7: overflow
        #   'y' is used by a VECTORDIST and the VECTOR2D to describe the second
        #       dimension.
        #   'run' is the run that the data was generated from.  Details up in
        #       the run table
        #   'tick' is a timestamp generated by the simulator.
        #   'bin' is the name of the bin that the data was generated in, if
        #       any.
        #   'data' is the actual stat value.
        #
        # INDEXES:
        #   'stat' is indexed so that a user can find all of the data for a
        #       particular stat. It is not unique, because that specific stat
        #       can be found in many runs, bins, and samples, in addition to
        #       having entries for the mulidimensional cases.
        #   'run' is indexed to allow a user to remove all of the data for a
        #       particular execution run.  It can also be used to allow the
        #       user to print out all of the data for a given run.
        #
        self.query('''
        CREATE TABLE data(
            dt_stat	SMALLINT UNSIGNED	NOT NULL,
            dt_x	SMALLINT		NOT NULL,
            dt_y	SMALLINT		NOT NULL,
            dt_run	SMALLINT UNSIGNED	NOT NULL,
            dt_tick	BIGINT UNSIGNED		NOT NULL,
            dt_bin	SMALLINT UNSIGNED	NOT NULL,
            dt_data	DOUBLE			NOT NULL,
            INDEX (dt_stat),
            INDEX (dt_run),
            UNIQUE (dt_stat,dt_x,dt_y,dt_run,dt_tick,dt_bin)
        ) TYPE=InnoDB;''')

        #
        # Names and descriptions for multi-dimensional stats (vectors, etc.)
        # are stored here instead of having their own entry in the statistics
        # table. This allows all parts of a single stat to easily share a
        # single id.
        #
        # COLUMNS:
        #   'stat' is the unique stat identifier from the stat table.
        #   'x' is the first dimension for multi-dimensional stats
        #       corresponding to the data table above.
        #   'y' is the second dimension for multi-dimensional stats
        #       corresponding to the data table above.
        #   'name' is the specific subname for the unique stat,x,y combination.
        #   'descr' is the specific description for the uniqe stat,x,y
        #        combination.
        #
        # INDEXES:
        #   'stat' is indexed so you can get the subdata for a specific stat.
        #
        self.query('''
        CREATE TABLE subdata(
            sd_stat	SMALLINT UNSIGNED	NOT NULL,
            sd_x	SMALLINT		NOT NULL,
            sd_y	SMALLINT		NOT NULL,
            sd_name	VARCHAR(255)		NOT NULL,
            sd_descr	TEXT,
            UNIQUE (sd_stat,sd_x,sd_y)
        ) TYPE=InnoDB''')


        #
        # The formula table is maintained separately from the data table
        # because formula data, unlike other stat data cannot be represented
        # there.
        #
        # COLUMNS:
        #   'stat' refers to the stat field generated in the stat table.
        #   'formula' is the actual string representation of the formula
        #       itself.
        #
        # INDEXES:
        #   'stat' is indexed so that you can just look up a formula.
        #
        self.query('''
        CREATE TABLE formulas(
            fm_stat	SMALLINT UNSIGNED	NOT NULL,
            fm_formula	BLOB			NOT NULL,
            PRIMARY KEY(fm_stat)
        ) TYPE=InnoDB''')

        #
        # Each stat used in each formula is kept in this table.  This way, if
        # you want to print out a particular formula, you can simply find out
        # which stats you need by looking in this table.  Additionally, when
        # you remove a stat from the stats table and data table, you remove
        # any references to the formula in this table.  When a formula is no
        # longer referred to, you remove its entry.
        #
        # COLUMNS:
        #   'stat' is the stat id from the stat table above.
        #   'child' is the stat id of a stat that is used for this formula.
        #       There may be many children for any given 'stat' (formula)
        #
        # INDEXES:
        #   'stat' is indexed so you can look up all of the children for a
        #       particular stat.
        #   'child' is indexed so that you can remove an entry when a stat is
        #       removed.
        #
        self.query('''
        CREATE TABLE formula_ref(
            fr_stat	SMALLINT UNSIGNED	NOT NULL,
            fr_run	SMALLINT UNSIGNED	NOT NULL,
            UNIQUE (fr_stat,fr_run),
            INDEX (fr_stat),
            INDEX (fr_run)
        ) TYPE=InnoDB''')

        # COLUMNS:
        #   'event' is the unique event id from the event_desc table
        #   'run' is simulation run id that this event took place in
        #   'tick' is the tick when the event happened
        #
        # INDEXES:
        #   'event' is indexed so you can look up all occurences of a
        #       specific event
        #   'run' is indexed so you can find all events in a run
        #   'tick' is indexed because we want the unique thing anyway
        #   'event,run,tick' is unique combination
        self.query('''
        CREATE TABLE events(
            ev_event	SMALLINT UNSIGNED	NOT NULL,
            ev_run	SMALLINT UNSIGNED	NOT NULL,
            ev_tick	BIGINT   UNSIGNED       NOT NULL,
            INDEX(ev_event),
            INDEX(ev_run),
            INDEX(ev_tick),
            UNIQUE(ev_event,ev_run,ev_tick)
        ) TYPE=InnoDB''')

        # COLUMNS:
        #   'id' is the unique description id
        #   'name' is the name of the event that occurred
        #
        # INDEXES:
        #   'id' is indexed because it is the primary key and is what you use
        #       to look up the descriptions
        #   'name' is indexed so one can find the event based on name
        #
        self.query('''
        CREATE TABLE event_names(
            en_id	SMALLINT UNSIGNED	NOT NULL AUTO_INCREMENT,
            en_name	VARCHAR(255)		NOT NULL,
            PRIMARY KEY (en_id),
            UNIQUE (en_name)
        ) TYPE=InnoDB''')

    def clean(self):
        self.query('''
        DELETE data
        FROM data
        LEFT JOIN runs ON dt_run=rn_id
        WHERE rn_id IS NULL''')

        self.query('''
        DELETE formula_ref
        FROM formula_ref
        LEFT JOIN runs ON fr_run=rn_id
        WHERE rn_id IS NULL''')

        self.query('''
        DELETE formulas
        FROM formulas
        LEFT JOIN formula_ref ON fm_stat=fr_stat
        WHERE fr_stat IS NULL''')

        self.query('''
        DELETE stats
        FROM stats
        LEFT JOIN data ON st_id=dt_stat
        WHERE dt_stat IS NULL''')

        self.query('''
        DELETE subdata
        FROM subdata
        LEFT JOIN data ON sd_stat=dt_stat
        WHERE dt_stat IS NULL''')

        self.query('''
        DELETE bins
        FROM bins
        LEFT JOIN data ON bn_id=dt_bin
        WHERE dt_bin IS NULL''')

        self.query('''
        DELETE events
        FROM events
        LEFT JOIN runs ON ev_run=rn_id
        WHERE rn_id IS NULL''')

        self.query('''
        DELETE event_names
        FROM event_names
        LEFT JOIN events ON en_id=ev_event
        WHERE ev_event IS NULL''')