diff --git a/mcc.py b/mcc.py index 044fc821fe8c0af9a7ce3bdf22fb4e42754354f4..0ea4b82854aa4c6c4a8134a42fb0078c2cffcaaa 100755 --- a/mcc.py +++ b/mcc.py @@ -68,7 +68,7 @@ missing_data_table = "{}_star".format(table_prefix) deletion_sql = "DROP TABLE IF EXISTS {};".format(missing_data_table) cursor.execute(deletion_sql) missing_selects_text = ",".join(missing_selects) -missing_data_sql = "CREATE TABLE {} AS SELECT {}, md5(CAST(({}) AS text)) AS superblock FROM {}".format(missing_data_table, missing_selects_text, missing_selects_text, table_prefix) +missing_data_sql = "CREATE TABLE {} AS SELECT block, {}, md5(CAST(({}) AS text)) AS superblock FROM {}".format(missing_data_table, missing_selects_text, missing_selects_text, table_prefix) print(missing_data_sql) cursor.execute(missing_data_sql) @@ -84,7 +84,7 @@ col_def = map(lambda name : "{} int NOT NULL".format(name), vars) creation_sql = "CREATE TABLE {}(superblock text, {}, prob float NOT NULL);".format(superblock_tmp_table, ",".join(col_def)) cursor.execute(creation_sql) -cursor.execute("SELECT DISTINCT * FROM {}".format(missing_data_table)) +cursor.execute("SELECT DISTINCT {}, superblock FROM {}".format(",".join(vars), missing_data_table)) ie = gum.LazyPropagation(bn) insert_sql = "INSERT INTO {}({}, superblock, prob) VALUES({},%s,%s)".format(superblock_tmp_table, ",".join(vars), ",".join(["%s" for _ in vars])) @@ -92,7 +92,7 @@ insert_sql = "INSERT INTO {}({}, superblock, prob) VALUES({},%s,%s)".format(supe def instantiation_to_list(inst, row, vars): l = [] i = 0 - inst_vars = list(map(lambda v : v.name(), inst.variablesSequence())) + inst_vars = list(map(lambda v : v.name(), inst.variablesSequence())) if inst is not None else [] for v in vars: if v in inst_vars: l.append(inst[v]) @@ -109,18 +109,23 @@ for row in cursor.fetchall(): if row[i] is None: null_vars.append(v) null_pos.append(i) - ie.addTarget(v) else: ie.addEvidence(v, row[i]) i+=1 + ie.addJointTarget(set(null_vars)) if len(null_vars) > 0: potentiel = ie.jointPosterior(set(null_vars)) if len(vars) > len(null_vars) else ie.evidenceJointImpact(vars,{}) - for i in potentiel.loopIn(): - inserted_row = instantiation_to_list(i, row, vars) + for inst in potentiel.loopIn(): + inserted_row = instantiation_to_list(inst, row, vars) inserted_row.append(row[-1]) - inserted_row.append(potentiel.get(i)) + inserted_row.append(potentiel.get(inst)) cursor.execute(insert_sql, inserted_row) - ie.eraseAllTargets() + else: + inserted_row = instantiation_to_list(None, row, vars) + inserted_row.append(row[-1]) + inserted_row.append(1) + cursor.execute(insert_sql, inserted_row) + ie.eraseAllJointTargets() ie.eraseAllEvidence() # tuples table creation @@ -146,6 +151,52 @@ deletion_sql = "DROP TABLE IF EXISTS {};".format(superblock_tmp_table) cursor.execute(deletion_sql) + +### Computation of the MCC +tuple_label_table = "{}_tuple_label".format(table_prefix) +block_label_table = "{}_block_label".format(table_prefix) +equigraph_table = "{}_equigraph".format(table_prefix) + +nb_tuples = 8 + +def tuple_joint_probability(id): + return 2*id/(nb_tuples * (nb_tuples + 1)) + +def tuple_occurence_weight(id, occurence): + return 2*db_size*tuple_joint_probability(id) - 2*occurence +1 + + +def initialize_labels(): + deletion_sql = "DROP TABLE IF EXISTS {};".format(tuple_label_table) + cursor.execute(deletion_sql) + cursor.execute("CREATE TABLE {} (tuple INTEGER, occurence INTEGER, label float, weight float, PRIMARY KEY (tuple, occurence)) ".format(tuple_label_table)) + + insert_sql = "INSERT INTO {}(tuple, occurence, label, weight) VALUES(%s,%s,%s,%s)".format(tuple_label_table) + cursor.execute("SELECT tuple FROM {}".format(tuple_table)) + + for row in cursor.fetchall(): + inserted_row = [] + tuple_id = row[0] + inserted_row.append(tuple_id) + inserted_row.append(1) + inserted_row.append(0) + inserted_row.append(tuple_occurence_weight(tuple_id, 1)) + cursor.execute(insert_sql, inserted_row) + + deletion_sql = "DROP TABLE IF EXISTS {};".format(block_label_table) + cursor.execute(deletion_sql) + cursor.execute("CREATE TABLE {} AS SELECT m.block, MAX(tl.weight) as label from {} as m, {} as s, {} as tl where m.superblock = s.superblock and s.tuple = tl.tuple GROUP BY m.block; ".format(block_label_table, missing_data_table, superblock_table, tuple_label_table)) + +def initialize_equigraph(): + deletion_sql = "DROP TABLE IF EXISTS {};".format(equigraph_table) + cursor.execute(deletion_sql) + cursor.execute("CREATE TABLE {} AS SELECT m.block, s.tuple, 0 AS max_occurence FROM {} as m, {} as s where m.superblock = s.superblock".format(equigraph_table, missing_data_table, superblock_table)) + + cursor.execute("UPDATE {} SET max_occurence = 1 WHERE (tuple, block) IN (SELECT tl.tuple, bl.block FROM {} AS tl, {} AS bl WHERE tl.weight = bl.label);".format(equigraph_table, tuple_label_table, block_label_table)) + +initialize_labels() +initialize_equigraph() + ###### Imputation # import numpy as np