diff --git a/mcc.py b/mcc.py index 11d6cb61bd0614389eae4d17f5477af5ff6d2952..f2fb87ee12b85e186fa201804314510a72a2cc5a 100755 --- a/mcc.py +++ b/mcc.py @@ -1,8 +1,15 @@ #!.venv/bin/python3 import pyAgrum as gum -import psycopg2 +import psycopg2 +import logging +from psycopg2.extras import LoggingConnection import sys import argparse +import time + + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) def is_var_indicator(v): return not is_var(v) @@ -34,7 +41,8 @@ g.toCSV(out_file) # PG connection conn = psycopg2.connect(database="mcc", user='postgres', password='postgres', - host='127.0.0.1', port='5432') + host='127.0.0.1', port='5432' )#, connection_factory=LoggingConnection) +# conn.initialize(logger) conn.autocommit = True cursor = conn.cursor() @@ -152,10 +160,10 @@ cursor.execute(deletion_sql) ### Computation of the MCC + tuple_label_table = "{}_tuple_label".format(table_prefix) block_label_table = "{}_block_label".format(table_prefix) -equigraph_table = "{}_equigraph".format(table_prefix) -matching_table = "{}_matching".format(table_prefix) +graph_table = "{}_graph".format(table_prefix) def tuple_joint_probability(tuple): i = bn.completeInstantiation() @@ -168,7 +176,6 @@ def tuple_joint_probability(tuple): i.chgVal(v, 1) n += 1 return p.get(i) - # return 2*id/(nb_tuples * (nb_tuples + 1)) def tuple_occurence_weight(tuple, occurence): return 2*db_size*tuple_joint_probability(tuple) - 2*occurence +1 @@ -196,66 +203,104 @@ def initialize_labels(): deletion_sql = "DROP TABLE IF EXISTS {};".format(block_label_table) cursor.execute(deletion_sql) cursor.execute("CREATE TABLE {} AS SELECT m.block, MAX(tl.weight) as label from {} as m, {} as s, {} as tl where m.superblock = s.superblock and s.tuple = tl.tuple GROUP BY m.block; ".format(block_label_table, missing_data_table, superblock_table, tuple_label_table)) + cursor.execute("CREATE UNIQUE INDEX ON {} (block)".format(block_label_table)) -def initialize_equigraph(): - deletion_sql = "DROP TABLE IF EXISTS {};".format(equigraph_table) +def initialize_graph(): + deletion_sql = "DROP TABLE IF EXISTS {};".format(graph_table) cursor.execute(deletion_sql) - cursor.execute("CREATE TABLE {} (block INTEGER, tuple INTEGER, max_occurence INTEGER) ".format(equigraph_table)) - cursor.execute("INSERT INTO {}(block, tuple, max_occurence) SELECT m.block, s.tuple, 0 AS max_occurence FROM {} as m, {} as s where m.superblock = s.superblock".format(equigraph_table, missing_data_table, superblock_table)) - cursor.execute("UPDATE {} SET max_occurence = 1 WHERE (tuple, block) IN (SELECT tl.tuple, bl.block FROM {} AS tl, {} AS bl WHERE tl.weight = bl.label);".format(equigraph_table, tuple_label_table, block_label_table)) - -def initialize_matching(): - deletion_sql = "DROP TABLE IF EXISTS {};".format(matching_table) - cursor.execute(deletion_sql) - cursor.execute("CREATE TABLE {} (block INTEGER, tuple INTEGER, occurence INTEGER) ".format(matching_table)) + cursor.execute("CREATE TABLE {} (block INTEGER, tuple INTEGER, occurence INTEGER, eq BOOLEAN, matching BOOLEAN DEFAULT false, PRIMARY KEY(block, tuple, occurence)) ".format(graph_table)) + cursor.execute("INSERT INTO {}(block, tuple, occurence, eq) SELECT m.block, s.tuple, 1 AS occurence, false AS eq FROM {} as m, {} as s where m.superblock = s.superblock".format(graph_table, missing_data_table, superblock_table)) + cursor.execute("UPDATE {} SET eq = true WHERE (tuple, block) IN (SELECT tl.tuple, bl.block FROM {} AS tl, {} AS bl WHERE tl.weight = bl.label);".format(graph_table, tuple_label_table, block_label_table)) + cursor.execute("CREATE INDEX ON {} (block)".format(graph_table)) + cursor.execute("CREATE INDEX ON {} (tuple,occurence)".format(graph_table)) + cursor.execute("CREATE INDEX ON {} (eq)".format(graph_table)) + cursor.execute("CREATE INDEX ON {} (matching)".format(graph_table)) + +# def initialize_matching(): +# deletion_sql = "DROP TABLE IF EXISTS {};".format(matching_table) +# cursor.execute(deletion_sql) +# cursor.execute("CREATE TABLE {} (block INTEGER, tuple INTEGER, occurence INTEGER) ".format(matching_table)) # return a block free vertex in the equigraph, if it exists def pick_free_block(): - cursor.execute("SELECT block FROM {} WHERE block NOT IN (SELECT block FROM {})".format(block_label_table, matching_table)) + cursor.execute("SELECT DISTINCT block FROM {} WHERE block NOT IN (SELECT block FROM {} WHERE matching = true)".format(graph_table, graph_table)) res = cursor.fetchone() return res[0] if res is not None else None +neigh_time = 0 # return the (tuple, occ) vertices that are the neighbors in the equigraph of at least one block in a set def get_neighbors(blocks): + start = time.monotonic() neighbors = set() - cursor.execute("SELECT tuple, max_occurence FROM {} WHERE max_occurence > 0 AND block IN %s".format(equigraph_table), [tuple(blocks)]) + cursor.execute("SELECT DISTINCT tuple, occurence FROM {} WHERE eq = true AND block IN %s".format(graph_table), [tuple(blocks)]) for row in cursor.fetchall(): neighbors.add(row) + + global neigh_time + neigh_time += time.monotonic() - start + return neighbors +matched_neigh_time = 0 + def get_matching_block(tuple, occurence): - cursor.execute("SELECT block FROM {} WHERE tuple = {} AND occurence = {}".format(matching_table, tuple, occurence)) + start = time.monotonic() + cursor.execute("SELECT block FROM {} WHERE tuple = {} AND occurence = {} AND matching = true".format(graph_table, tuple, occurence)) res = cursor.fetchone() + + global matched_neigh_time + matched_neigh_time += time.monotonic() - start + return res[0] if res is not None else None +matching_time = 0 + def augmenting_path_insertion(u, y, prec): + start = time.monotonic() is_edge_matched = False while y != u: if is_edge_matched : - cursor.execute("DELETE FROM {} WHERE block = %s AND tuple = %s AND occurence = %s".format(matching_table), [y, prec[y][0], prec[y][1]]) + cursor.execute("UPDATE {} SET matching = false WHERE block = %s AND tuple = %s AND occurence = %s".format(graph_table), [y, prec[y][0], prec[y][1]]) else: - cursor.execute("INSERT INTO {} (block, tuple, occurence) VALUES (%s, %s, %s)".format(matching_table), [prec[y], y[0], y[1]]) + cursor.execute("UPDATE {} SET matching = true WHERE block = %s AND tuple = %s AND occurence = %s".format(graph_table), [prec[y], y[0], y[1]]) y = prec[y] is_edge_matched = not is_edge_matched - + global matching_time + matching_time += time.monotonic() - start + +lookup_time = 0 +update_time1 = 0 +update_time2 = 0 +update_time3 = 0 +label_update_nb = 0 +edges_nb = 0 +max_S = 0 + def update_labels(S, T): + start = time.monotonic() + global label_update_nb + label_update_nb += 1 + + # first query for the edges not in the equigraph, but stored (implies occurence = 1) + # second query for the edges not in the equigraph and not stored (not considered so far) cursor.execute(""" - SELECT e.block, e.tuple, e.max_occurence + 1, tl2.weight - 2, tb.label, COALESCE(tl.label, 0) - FROM ({} AS e LEFT OUTER JOIN {} AS tl ON tl.occurence = e.max_occurence + 1 AND tl.tuple = e.tuple), {} AS tb, {} as tl2 - WHERE e.block IN %s - AND e.max_occurence < %s AND e.max_occurence != 0 - AND tb.block = e.block AND tl2.occurence = e.max_occurence AND tl2.tuple = e.tuple - UNION ALL - SELECT e.block, e.tuple, 1, tl.weight, tb.label, tl.label - FROM {} AS e, {} AS tl, {} AS tb - WHERE e.block IN %s - AND e.max_occurence = 0 AND tl.tuple = e.tuple - AND tl.occurence = 1 AND tb.block = e.block""".format(equigraph_table, tuple_label_table, block_label_table, tuple_label_table, equigraph_table, tuple_label_table, block_label_table), [tuple(S), db_size, tuple(S)]) + SELECT g.block, g.tuple, g.occurence, tl.weight, bl.label, tl.label, true AS existing + FROM {} AS g, {} AS bl, {} AS tl + WHERE tl.tuple = g.tuple AND tl.occurence = g.occurence AND bl.block = g.block + AND g.eq = false AND g.block IN %s + UNION + SELECT g.block, g.tuple, g.occurence + 1, tl.weight - 2, bl.label, COALESCE(tln.label,0), false AS existing + FROM {} AS g, {} AS bl, {} AS tl LEFT OUTER JOIN {} AS tln ON (tl.tuple = tln.tuple AND tl.occurence + 1 = tln.occurence) + WHERE tl.tuple = g.tuple AND tl.occurence = g.occurence AND bl.block = g.block + AND g.eq = true AND g.occurence < %s AND g.block IN %s + """.format(graph_table, block_label_table, tuple_label_table, graph_table, block_label_table, tuple_label_table, tuple_label_table), [tuple(S), db_size, tuple(S)]) alpha = sys.float_info.max edges_to_add = [] for row in cursor.fetchall(): + # print(row) if (row[1], row[2]) in T: + # skip the edge not considered so far whose tuple is in T continue a = row[4] + row[5] - row[3] if a < alpha : @@ -263,31 +308,59 @@ def update_labels(S, T): edges_to_add = [] if a == alpha: edges_to_add.append(row) - + mt = time.monotonic() + global lookup_time + lookup_time += mt - start + mt2=mt + cursor.execute("UPDATE {} SET label = label - %s WHERE block in %s".format(block_label_table), [alpha, tuple(S)]) if len(T) > 0: cursor.execute("UPDATE {} SET label = label + %s WHERE (tuple, occurence) in %s".format(tuple_label_table), [alpha, tuple(T)]) - cursor.execute("UPDATE {} SET max_occurence = CASE WHEN max_occurence > 0 THEN max_occurence -1 ELSE 0 END WHERE (tuple, max_occurence) in %s AND block NOT IN %s".format(equigraph_table), [tuple(T), tuple(S)]) + # remove from the equigraph the edges from T and a block not in S + mt2 = time.monotonic() + global update_time1 + update_time1 += mt2 - mt + cursor.execute("DELETE FROM {} WHERE eq = True AND occurence > 1 AND (tuple, occurence) in %s AND block NOT IN %s".format(graph_table), [tuple(T), tuple(S)]) + cursor.execute("UPDATE {} SET eq = False WHERE occurence = 1 AND (tuple, occurence) in %s AND block NOT IN %s".format(graph_table), [tuple(T), tuple(S)]) + + mt3 = time.monotonic() + global update_time2 + update_time2 += mt3 - mt2 + global edges_nb + edges_nb += len(edges_to_add) + global max_S + max_S = max(len(S), max_S) for edge in edges_to_add: - cursor.execute("INSERT INTO {} (tuple, occurence, label, weight) VALUES (%s, %s, %s, %s) ON CONFLICT DO NOTHING".format(tuple_label_table), [edge[1], edge[2], edge[5], edge[3]]) - cursor.execute("UPDATE {} SET max_occurence = %s WHERE block = %s AND tuple = %s".format(equigraph_table), [edge[2], edge[0], edge[1]]) - # input("updated labels with S={}, T={} and alpha={} \n with edges {}".format(S,T, alpha, edges_to_add)) + # if the edge exists in the graph already + if edge[6] : + cursor.execute("UPDATE {} SET eq = true WHERE block = %s AND tuple = %s AND occurence = %s".format(graph_table), [edge[0], edge[1], edge[2]]) + else: + cursor.execute("INSERT INTO {} (tuple, occurence, label, weight) VALUES (%s, %s, %s, %s) ON CONFLICT DO NOTHING".format(tuple_label_table), [edge[1], edge[2], edge[5], edge[3]]) + cursor.execute("INSERT INTO {} (block, tuple, occurence, eq) VALUES (%s, %s, %s, true)".format(graph_table), [edge[0], edge[1], edge[2]]) + + global update_time3 + update_time3 += time.monotonic() - mt3 + return list(map(lambda e: [e[0], (e[1],e[2])] ,edges_to_add)) +exploring_step_nb = 0 def hungarian_step(u, S, T, S_neigh, prec): + # print("u={}".format(u)) while True : remaining_neigh = S_neigh - T if len(remaining_neigh) > 0: + global exploring_step_nb + exploring_step_nb += 1 y = next(iter(remaining_neigh)) if len(S) == 1: prec[y] = u z = get_matching_block(y[0], y[1]) if z is None : - print("augmenting path from {} to {} with S={} and T={} \n".format(u, y, S, T)) augmenting_path_insertion(u, y, prec) + # print("augmenting path from {} to {} with S={} and T={} \n".format(u, y, S, T)) return else: S.add(z) @@ -297,7 +370,7 @@ def hungarian_step(u, S, T, S_neigh, prec): S_neigh.add(nz) if nz != y and nz not in prec : prec[nz] = z - # input("matched y={} by z={} with S={}, T={} and neigh={}".format(y,z,S,T, S_neigh)) + # print("matched y={} by z={} with S={}, T={} and neigh={}".format(y,z,S,T, S_neigh)) else: new_edges = update_labels(S, T) for e in new_edges: @@ -315,14 +388,26 @@ def hungarian_algorithm(): prec = dict() hungarian_step(u, S, T, S_neigh, prec) - initialize_labels() -initialize_equigraph() -initialize_matching() +initialize_graph() + input("starting hungarian algorithm") +start = time.monotonic() hungarian_algorithm() +print(f"exploring steps : {exploring_step_nb}") +print(f"neigh time in {neigh_time} s") +print(f"matched neigh time in {matched_neigh_time} s") +print(f"label update nb : {label_update_nb}") +print(f"lookup time in {lookup_time} s") +print(f"update time 1 in {update_time1} s") +print(f"edges nb : {edges_nb}") +print(f"max S : {max_S}") +print(f"update time 2 in {update_time2} s") +print(f"update time 3 in {update_time3} s") +print(f"matching time in {matching_time} s") +print(f"total time in {time.monotonic() - start} s") ###### Imputation