def possible_worlds(dist, n):
    """Enumerate every length-n assignment of values drawn from dist's keys.

    Returns a list of len(dist)**n worlds, ordered lexicographically with
    respect to dist's key insertion order.
    """
    worlds = [[]]
    for _ in range(n):
        # Extend every partial world by each possible value.
        worlds = [world + [value] for world in worlds for value in dist]
    return worlds


def world_key(w):
    """Return a run-length key such as "2x1 1x3" describing the values of w.

    NOTE: sorts w in place — world_classes relies on this side effect for
    the stored "world_ex" example worlds.
    """
    w.sort()
    parts = []
    run_length = 0
    for idx, value in enumerate(w):
        run_length += 1
        at_end = idx == len(w) - 1
        # A run ends at the last element or when the next value differs.
        if at_end or w[idx + 1] != value:
            parts.append("{}x{}".format(run_length, value))
            run_length = 0
    return " ".join(parts)


def world_prob(w, dist):
    """Probability of a world: product of the probabilities of its values."""
    probability = 1
    for value in w:
        probability = probability * dist[value]
    return probability


def world_classes(dist, n, class_key=world_key):
    """Group the possible worlds into classes sharing the same class_key.

    Worlds are in the same class iff class_key gives the same key. Each
    class records an example world, the set of value distributions
    (world_key strings) it contains, its total probability and its size.
    """
    classes = {}
    for world in possible_worlds(dist, n):
        key = class_key(world)
        entry = classes.get(key)
        if entry is None:
            classes[key] = {
                "world_ex": world,
                "possible_values": {world_key(world)},
                "class_prob": world_prob(world, dist),
                "count": 1,
            }
        else:
            entry["count"] += 1
            entry["class_prob"] += world_prob(world, dist)
            entry["possible_values"].add(world_key(world))
    return classes
2, 2, 3, 3, 3, 3]}, + '1x1 4x2 3x3': {'class_prob': 0.00010013580322265625, + 'count': 280, + 'possible_values': {'1x1 4x2 3x3'}, + 'world_ex': [1, 2, 2, 2, 2, 3, 3, 3]}, + '1x1 5x2 2x3': {'class_prob': 6.008148193359375e-05, + 'count': 168, + 'possible_values': {'1x1 5x2 2x3'}, + 'world_ex': [1, 2, 2, 2, 2, 2, 3, 3]}, + '1x1 6x2 1x3': {'class_prob': 2.002716064453125e-05, + 'count': 56, + 'possible_values': {'1x1 6x2 1x3'}, + 'world_ex': [1, 2, 2, 2, 2, 2, 2, 3]}, + '1x1 7x2': {'class_prob': 2.86102294921875e-06, + 'count': 8, + 'possible_values': {'1x1 7x2'}, + 'world_ex': [1, 2, 2, 2, 2, 2, 2, 2]}, + '1x1 7x3': {'class_prob': 2.86102294921875e-06, + 'count': 8, + 'possible_values': {'1x1 7x3'}, + 'world_ex': [1, 3, 3, 3, 3, 3, 3, 3]}, + '1x2 7x3': {'class_prob': 4.76837158203125e-07, + 'count': 8, + 'possible_values': {'1x2 7x3'}, + 'world_ex': [2, 3, 3, 3, 3, 3, 3, 3]}, + '2x1 1x2 5x3': {'class_prob': 0.0003604888916015625, + 'count': 168, + 'possible_values': {'2x1 1x2 5x3'}, + 'world_ex': [1, 1, 2, 3, 3, 3, 3, 3]}, + '2x1 2x2 4x3': {'class_prob': 0.0009012222290039062, + 'count': 420, + 'possible_values': {'2x1 2x2 4x3'}, + 'world_ex': [1, 1, 2, 2, 3, 3, 3, 3]}, + '2x1 3x2 3x3': {'class_prob': 0.001201629638671875, + 'count': 560, + 'possible_values': {'2x1 3x2 3x3'}, + 'world_ex': [1, 1, 2, 2, 2, 3, 3, 3]}, + '2x1 4x2 2x3': {'class_prob': 0.0009012222290039062, + 'count': 420, + 'possible_values': {'2x1 4x2 2x3'}, + 'world_ex': [1, 1, 2, 2, 2, 2, 3, 3]}, + '2x1 5x2 1x3': {'class_prob': 0.0003604888916015625, + 'count': 168, + 'possible_values': {'2x1 5x2 1x3'}, + 'world_ex': [1, 1, 2, 2, 2, 2, 2, 3]}, + '2x1 6x2': {'class_prob': 6.008148193359375e-05, + 'count': 28, + 'possible_values': {'2x1 6x2'}, + 'world_ex': [1, 1, 2, 2, 2, 2, 2, 2]}, + '2x1 6x3': {'class_prob': 6.008148193359375e-05, + 'count': 28, + 'possible_values': {'2x1 6x3'}, + 'world_ex': [1, 1, 3, 3, 3, 3, 3, 3]}, + '2x2 6x3': {'class_prob': 1.6689300537109375e-06, + 'count': 28, + 
'possible_values': {'2x2 6x3'}, + 'world_ex': [2, 2, 3, 3, 3, 3, 3, 3]}, + '3x1 1x2 4x3': {'class_prob': 0.003604888916015625, + 'count': 280, + 'possible_values': {'3x1 1x2 4x3'}, + 'world_ex': [1, 1, 1, 2, 3, 3, 3, 3]}, + '3x1 2x2 3x3': {'class_prob': 0.00720977783203125, + 'count': 560, + 'possible_values': {'3x1 2x2 3x3'}, + 'world_ex': [1, 1, 1, 2, 2, 3, 3, 3]}, + '3x1 3x2 2x3': {'class_prob': 0.00720977783203125, + 'count': 560, + 'possible_values': {'3x1 3x2 2x3'}, + 'world_ex': [1, 1, 1, 2, 2, 2, 3, 3]}, + '3x1 4x2 1x3': {'class_prob': 0.003604888916015625, + 'count': 280, + 'possible_values': {'3x1 4x2 1x3'}, + 'world_ex': [1, 1, 1, 2, 2, 2, 2, 3]}, + '3x1 5x2': {'class_prob': 0.000720977783203125, + 'count': 56, + 'possible_values': {'3x1 5x2'}, + 'world_ex': [1, 1, 1, 2, 2, 2, 2, 2]}, + '3x1 5x3': {'class_prob': 0.000720977783203125, + 'count': 56, + 'possible_values': {'3x1 5x3'}, + 'world_ex': [1, 1, 1, 3, 3, 3, 3, 3]}, + '3x2 5x3': {'class_prob': 3.337860107421875e-06, + 'count': 56, + 'possible_values': {'3x2 5x3'}, + 'world_ex': [2, 2, 2, 3, 3, 3, 3, 3]}, + '4x1 1x2 3x3': {'class_prob': 0.02162933349609375, + 'count': 280, + 'possible_values': {'4x1 1x2 3x3'}, + 'world_ex': [1, 1, 1, 1, 2, 3, 3, 3]}, + '4x1 2x2 2x3': {'class_prob': 0.032444000244140625, + 'count': 420, + 'possible_values': {'4x1 2x2 2x3'}, + 'world_ex': [1, 1, 1, 1, 2, 2, 3, 3]}, + '4x1 3x2 1x3': {'class_prob': 0.02162933349609375, + 'count': 280, + 'possible_values': {'4x1 3x2 1x3'}, + 'world_ex': [1, 1, 1, 1, 2, 2, 2, 3]}, + '4x1 4x2': {'class_prob': 0.0054073333740234375, + 'count': 70, + 'possible_values': {'4x1 4x2'}, + 'world_ex': [1, 1, 1, 1, 2, 2, 2, 2]}, + '4x1 4x3': {'class_prob': 0.0054073333740234375, + 'count': 70, + 'possible_values': {'4x1 4x3'}, + 'world_ex': [1, 1, 1, 1, 3, 3, 3, 3]}, + '4x2 4x3': {'class_prob': 4.172325134277344e-06, + 'count': 70, + 'possible_values': {'4x2 4x3'}, + 'world_ex': [2, 2, 2, 2, 3, 3, 3, 3]}, + '5x1 1x2 2x3': {'class_prob': 
0.0778656005859375, + 'count': 168, + 'possible_values': {'5x1 1x2 2x3'}, + 'world_ex': [1, 1, 1, 1, 1, 2, 3, 3]}, + '5x1 2x2 1x3': {'class_prob': 0.0778656005859375, + 'count': 168, + 'possible_values': {'5x1 2x2 1x3'}, + 'world_ex': [1, 1, 1, 1, 1, 2, 2, 3]}, + '5x1 3x2': {'class_prob': 0.0259552001953125, + 'count': 56, + 'possible_values': {'5x1 3x2'}, + 'world_ex': [1, 1, 1, 1, 1, 2, 2, 2]}, + '5x1 3x3': {'class_prob': 0.0259552001953125, + 'count': 56, + 'possible_values': {'5x1 3x3'}, + 'world_ex': [1, 1, 1, 1, 1, 3, 3, 3]}, + '5x2 3x3': {'class_prob': 3.337860107421875e-06, + 'count': 56, + 'possible_values': {'5x2 3x3'}, + 'world_ex': [2, 2, 2, 2, 2, 3, 3, 3]}, + '6x1 1x2 1x3': {'class_prob': 0.155731201171875, + 'count': 56, + 'possible_values': {'6x1 1x2 1x3'}, + 'world_ex': [1, 1, 1, 1, 1, 1, 2, 3]}, + '6x1 2x2': {'class_prob': 0.0778656005859375, + 'count': 28, + 'possible_values': {'6x1 2x2'}, + 'world_ex': [1, 1, 1, 1, 1, 1, 2, 2]}, + '6x1 2x3': {'class_prob': 0.0778656005859375, + 'count': 28, + 'possible_values': {'6x1 2x3'}, + 'world_ex': [1, 1, 1, 1, 1, 1, 3, 3]}, + '6x2 2x3': {'class_prob': 1.6689300537109375e-06, + 'count': 28, + 'possible_values': {'6x2 2x3'}, + 'world_ex': [2, 2, 2, 2, 2, 2, 3, 3]}, + '7x1 1x2': {'class_prob': 0.13348388671875, + 'count': 8, + 'possible_values': {'7x1 1x2'}, + 'world_ex': [1, 1, 1, 1, 1, 1, 1, 2]}, + '7x1 1x3': {'class_prob': 0.13348388671875, + 'count': 8, + 'possible_values': {'7x1 1x3'}, + 'world_ex': [1, 1, 1, 1, 1, 1, 1, 3]}, + '7x2 1x3': {'class_prob': 4.76837158203125e-07, + 'count': 8, + 'possible_values': {'7x2 1x3'}, + 'world_ex': [2, 2, 2, 2, 2, 2, 2, 3]}, + '8x1': {'class_prob': 0.1001129150390625, + 'count': 1, + 'possible_values': {'8x1'}, + 'world_ex': [1, 1, 1, 1, 1, 1, 1, 1]}, + '8x2': {'class_prob': 5.960464477539063e-08, + 'count': 1, + 'possible_values': {'8x2'}, + 'world_ex': [2, 2, 2, 2, 2, 2, 2, 2]}, + '8x3': {'class_prob': 5.960464477539063e-08, + 'count': 1, + 'possible_values': 
{'8x3'}, + 'world_ex': [3, 3, 3, 3, 3, 3, 3, 3]}} +#+end_example + +#+BEGIN_src jupyter-python + # the classes of possible worlds where a class contains the world having the same sum of values + pprint.pprint(world_classes(dist, n, sum)) +#+END_src + +#+RESULTS: +#+begin_example + {8: {'class_prob': 0.1001129150390625, + 'count': 1, + 'possible_values': {'8x1'}, + 'world_ex': [1, 1, 1, 1, 1, 1, 1, 1]}, + 9: {'class_prob': 0.13348388671875, + 'count': 8, + 'possible_values': {'7x1 1x2'}, + 'world_ex': [1, 1, 1, 1, 1, 1, 1, 2]}, + 10: {'class_prob': 0.2113494873046875, + 'count': 36, + 'possible_values': {'7x1 1x3', '6x1 2x2'}, + 'world_ex': [1, 1, 1, 1, 1, 1, 1, 3]}, + 11: {'class_prob': 0.1816864013671875, + 'count': 112, + 'possible_values': {'5x1 3x2', '6x1 1x2 1x3'}, + 'world_ex': [1, 1, 1, 1, 1, 1, 2, 3]}, + 12: {'class_prob': 0.16113853454589844, + 'count': 266, + 'possible_values': {'4x1 4x2', '5x1 2x2 1x3', '6x1 2x3'}, + 'world_ex': [1, 1, 1, 1, 1, 1, 3, 3]}, + 13: {'class_prob': 0.10021591186523438, + 'count': 504, + 'possible_values': {'5x1 1x2 2x3', '4x1 3x2 1x3', '3x1 5x2'}, + 'world_ex': [1, 1, 1, 1, 1, 2, 3, 3]}, + 14: {'class_prob': 0.062064170837402344, + 'count': 784, + 'possible_values': {'4x1 2x2 2x3', '2x1 6x2', '3x1 4x2 1x3', '5x1 3x3'}, + 'world_ex': [1, 1, 1, 1, 1, 3, 3, 3]}, + 15: {'class_prob': 0.02920246124267578, + 'count': 1016, + 'possible_values': {'1x1 7x2', + '2x1 5x2 1x3', + '3x1 3x2 2x3', + '4x1 1x2 3x3'}, + 'world_ex': [1, 1, 1, 1, 2, 3, 3, 3]}, + 16: {'class_prob': 0.0135384202003479, + 'count': 1107, + 'possible_values': {'1x1 6x2 1x3', + '2x1 4x2 2x3', + '3x1 2x2 3x3', + '4x1 4x3', + '8x2'}, + 'world_ex': [1, 1, 1, 1, 3, 3, 3, 3]}, + 17: {'class_prob': 0.004867076873779297, + 'count': 1016, + 'possible_values': {'1x1 5x2 2x3', + '2x1 3x2 3x3', + '3x1 1x2 4x3', + '7x2 1x3'}, + 'world_ex': [1, 1, 1, 2, 3, 3, 3, 3]}, + 18: {'class_prob': 0.0017240047454833984, + 'count': 784, + 'possible_values': {'3x1 5x3', '6x2 2x3', '1x1 4x2 
def most_probable_classes(classes):
    """Return the key(s) of the class(es) with the highest probability.

    All ties are kept, in the iteration order of classes.
    """
    best_keys = []
    best_prob = 0
    for key, info in classes.items():
        class_prob = info["class_prob"]
        if class_prob == best_prob:
            best_keys.append(key)
        elif class_prob > best_prob:
            best_keys = [key]
            best_prob = class_prob
    return {"keys": best_keys, "prob": best_prob}


def most_probable_ans(dist, n, agg):
    """Most probable answer(s): classes are keyed by the aggregate agg,
    and the likeliest class key(s) are returned as answers."""
    classes = world_classes(dist, n, agg)
    best = most_probable_classes(classes)
    return [
        {
            "ans": key,
            "prob": best["prob"],
            "possible_values": classes[key]["possible_values"],
        }
        for key in best["keys"]
    ]


def most_correct_ans(dist, n, agg):
    """Most correct answer(s): classes are keyed by value distribution
    (world_key, not agg); agg is then applied to an example world of the
    likeliest class."""
    classes = world_classes(dist, n)
    best = most_probable_classes(classes)
    return [
        {
            "ans": agg(classes[key]["world_ex"]),
            "prob": best["prob"],
            "possible_values": key,
        }
        for key in best["keys"]
    ]
+ +* Query answering over block dependent probabilistic databases + +The different notions of query answering for a numerical query q (including Boolean queries: 0 or 1) over a BIPDB D: + +- the *expect value* defined by $E(q(D))$ +- a *most probable answer* is an possible answer having the highest probability +- a *best answer* is an answer on a most probable distribution of the tuples. In this case, the possible worlds that have the same distribution of tuples are considered as equivalent : we say that they form a *class*. + +The answer of a CQ over BIPDB should be another BIPDB. + +** Open questions + +- The [[file:best-answer-vs-most-probable.org][comparison of the best answer and the most probable answer]] shows that the two notions are different on a small example. +- The best answer and the expect value are the same notion when the number of rows in D is such that for every probabilities p, $|D| \times p$ is an integer ? It leads us to another question. In this case, is the class of possible worlds where the tuples is compliant with the distribution the most probable class ? I started to work on those questions [[file:most-probable-class.org][here]]. + diff --git a/projects/missingdata/most-probable-class.org b/projects/missingdata/most-probable-class.org new file mode 100644 index 0000000000000000000000000000000000000000..bf91d94a64292e5aede0031d9de3cf9e45cbffa3 --- /dev/null +++ b/projects/missingdata/most-probable-class.org @@ -0,0 +1,94 @@ +#+TITLE: Which is the most probable class ? +#+PROPERTY: header-args :session most-prob-class :exports both :results output :tangle yes +#+OPTIONS: toc:nil + +* Theoretical result + +We consider the case of a random variable $X$ with a finite range $\{v_{1}, \dots, v_{m}\}$ and there exists an minimal integer $Z$ such that $P(X=v_{i}) = \frac{u_{i}}{Z}$. So, we have $\sum_{1\leq i \leq m} u_{i} = Z$. 
+ +We perform $n$ independent draws of $X$, the probability of obtaining $k_{i}$ times the values $v_{i}$ with $\sum_{1\leq i \leq m} k_{i} = n$ is: +$$\binom{n}{k_{1}} (\frac{u_{1}}{Z})^{k_{1}} \times \binom{n - k_{1}}{k_{2}} (\frac{u_{2}}{Z})^{k_{2}} \dots \times \binom{n - k_{1} \dots - k_{m-1}}{k_{m}} (\frac{u_{m}}{Z})^{k_{m}}$$ + +We can simply the formula to obtain: +$$\frac{n!}{Z^{n}} \prod_{1\leq i \leq m} \frac{u_{i}^{k_{i}}}{k_{i}!}$$ + +Finding the set of values $k_{i}$ that maximize the above formula is equivalent to find for each $i$ the $k_{i}$ maximizing $\frac{u_{i}^{k_{i}}}{k_{i}!}$. According the following section, the maximum is reached when $k_{i} = u_{i}$. However the additional constraint $\sum_{1\leq i \leq m} k_{i} = n$ ensures that the choice $k_{i} = u_{i}$ is possible iff $n$ is a multiple of $Z$. + + +* Analyze of u^k/k! + +In the following, we observe that the maximum of $\frac{u^{k}}{k!}$ for fixed $u$ seems to be reached when $k=u$. + +#+BEGIN_src jupyter-python + import matplotlib.pyplot as plt + import numpy as np + n = 50 + + k = np.arange(0, n) + k[0] = 1 # fact(0) + u = np.repeat(np.arange(0, n), n).reshape((n, n)) + uoverk = np.divide(u, k) + uoverk[:,0] = 1 # u^0 =1 + res = np.cumprod(uoverk, axis=1) + normalized_res = res/res.max(axis=1)[:,None] +#+END_src + +#+RESULTS: + + +#+BEGIN_src jupyter-python + fig, axis = plt.subplots() # il me semble que c'est une bonne habitude de faire supbplots + heatmap = axis.pcolor(normalized_res, cmap=plt.cm.Blues) # heatmap contient les valeurs + plt.colorbar(heatmap) + plt.xlabel("k") + plt.ylabel("u", rotation=0) + plt.title("u^k/k! 
def most_prob_class(k):
    """Probability of the most probable class for n = 6k throws of a die.

    Evaluates (6k)! / (6^(6k) * (k!)^6) as an incremental product,
    pairing each factor i of (6k)! with one factor of (k!)^6 so the
    intermediate values stay bounded.
    """
    probability = 1.0
    for i in range(1, 6 * k + 1):
        remainder = i % k
        divisor = remainder if remainder else k
        probability *= i / (6 * divisor)
    return probability
+ +For $Q$ a given numerical query and a $(\mathcal D, \mathcal W, P)$ a PDB, the /expected value/ of $Q$ on $(\mathcal D, \mathcal W, P)$ is defined by: +$$E(Q(D)) = \int_{D \in \mathcal W} Q(D) dP$$ + + + diff --git a/projects/missingdata/most-probable-class.tex b/projects/missingdata/most-probable-class.tex new file mode 100644 index 0000000000000000000000000000000000000000..2192bbe89f3cfecb1153a03a818417b9eaff36d2 --- /dev/null +++ b/projects/missingdata/most-probable-class.tex @@ -0,0 +1,68 @@ +% Created 2023-06-09 ven. 18:11 +% Intended LaTeX compiler: pdflatex +\documentclass[11pt]{article} +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{graphicx} +\usepackage{longtable} +\usepackage{wrapfig} +\usepackage{rotating} +\usepackage[normalem]{ulem} +\usepackage{amsmath} +\usepackage{amssymb} +\usepackage{capt-of} +\usepackage{hyperref} +\author{Maxime Buron} +\date{\today} +\title{Which is the most probable class ?} +\hypersetup{ + pdfauthor={Maxime Buron}, + pdftitle={Which is the most probable class ?}, + pdfkeywords={}, + pdfsubject={}, + pdfcreator={Emacs 28.2 (Org mode 9.5.5)}, + pdflang={English}} +\begin{document} + +\maketitle + +\section{Theoretical result} +\label{sec:org8ad4e4a} + +We consider the case of a random variable \(X\) with a finite range \(\{v_{1}, \dots, v_{m}\}\) and there exists an minimal integer \(Z\) such that \(P(X=v_{i}) = \frac{u_{i}}{Z}\). So, we have \(\sum_{1\leq i \leq m} u_{i} = Z\). 
+ +We perform \(n\) independent draws of \(X\), the probability of obtaining \(k_{i}\) times the values \(v_{i}\) with \(\sum_{1\leq i \leq m} k_{i} = n\) is: +$$\binom{n}{k_{1}} (\frac{u_{1}}{Z})^{k_{1}} \times \binom{n - k_{1}}{k_{2}} (\frac{u_{2}}{Z})^{k_{2}} \dots \times \binom{n - k_{1} \dots - k_{m-1}}{k_{m}} (\frac{u_{m}}{Z})^{k_{m}}$$ + +We can simply the formula to obtain: +$$\frac{n!}{Z^{m}} \prod_{1\leq i \leq m} \frac{u_{i}^{k_{i}}}{k_{i}!}$$ + +Finding the set of values \(k_{i}\) that maximize the above formula is equivalent to find for each \(i\) the \(k_{i}\) maximizing \(\frac{u_{i}^{k_{i}}}{k_{i}!}\). According the following section, the maximum is reached when \(k_{i} = u_{i}\). However the additional constraint \(\sum_{1\leq i \leq m} k_{i} = n\) ensures that the choice \(k_{i} = u_{i}\) is possible iff \(n\) is a multiple of \(Z\). + + +\section{Analyze of u\textsuperscript{k}/k!} +\label{sec:org338bec8} + +In the following, we observe that the maximum of \(\frac{u^{k}}{k!}\) for fixed \(u\) seems to be reached when \(k=u\). + +\begin{center} +\includegraphics[width=.9\linewidth]{./.ob-jupyter/e16c9d053b3952bf48300ded8b9cfea3a1e6e881.png} +\end{center} + + +TODO: theoretically show the result with the sign of \(\frac{u^{k+1}}{(k+1)!} - \frac{u^{k}}{k!}\) + +\section{{\bfseries\sffamily TODO} Comparison of expected value and best answer} +\label{sec:org41aad11} + +In general, a PDB is a triplet \((\mathcal D, \mathcal W, P)\) where \(\mathcal D\) is the possibly infinite set of possible tuples, \(\mathcal W\) is a \(\sigma\) algebra on \(\mathcal D\), it represents the set of the possible database instances, so every member of \(\mathcal W\) is a finite set and \(P\) is a probability over \(\mathcal W\). + +How to define the union or intersection of two instances in \(\mathcal W\) with the bag semantic ? + +How to define an independent block PDB as a PDB from the probabilities of the values in each block ? It should be easy. 
Is the order of the tuples taken into account ? + +Finally, how to relate the previous results with the probabilities of BIDPDB ? + +For \(Q\) a given numerical query and a \((\mathcal D, \mathcal W, P)\) a PDB, the \emph{expected value} of \(Q\) on \((\mathcal D, \mathcal W, P)\) is defined by: +$$E(Q(D)) = \int_{D \in \mathcal W} Q(D) dP$$ +\end{document} \ No newline at end of file