diff --git a/Data.txt b/Data.txt
new file mode 100644
index 0000000..351f992
--- /dev/null
+++ b/Data.txt
@@ -0,0 +1,9 @@
+I1,I2,I5
+I2,I4
+I2,I3
+I1,I2,I4
+I1,I3
+I2,I3
+I1,I3
+I1,I2,I3,I5
+I1,I2,I3
\ No newline at end of file
diff --git a/TEST.md b/TEST.md
new file mode 100644
index 0000000..5f13818
--- /dev/null
+++ b/TEST.md
@@ -0,0 +1,2 @@
+This is for testing purposes only
+Testing the commit signing key
diff --git a/apriori.ipynb b/apriori.ipynb
new file mode 100644
index 0000000..f1a221d
--- /dev/null
+++ b/apriori.ipynb
@@ -0,0 +1,238 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from utils import *\n",
+    "from functions import load_transaction_data, get_frequent, join_set_itemsets, count_occurences, write_rules, powerset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "38d9e47d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "9\n"
+     ]
+    }
+   ],
+   "source": [
+    "path_to_data = 'Data.txt'\n",
+    "min_support = 2/9\n",
+    "min_confidence = 0.3 # 30 percent\n",
+    "order = ['I' + str(i) for i in range(1,6)]\n",
+    "#print(order)\n",
+    "\n",
+    "with open(path_to_data, \"r\") as file:\n",
+    "    data = file.read()\n",
+    "transactions = data.split(\"\\n\")\n",
+    "num_trans = len(transactions)\n",
+    "\n",
+    "print(num_trans)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "759093e0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "L1 \n",
+      "\n",
+      "([['I1'], ['I2'], ['I3'], ['I4'], ['I5']], [6, 7, 6, 2, 2])\n"
+     ]
+    }
+   ],
+   "source": [
+    "Transactions = load_transaction_data(path_to_data, order)\n",
+    "\n",
+    "# initialization\n",
+    "# for itemsets of size 1\n",
+    "C = {}\n",
+    "L = {}\n",
+    "itemset_size = 1\n",
+    "Discarded = {itemset_size : []}\n",
+    "C.update({itemset_size : [ [f] for f in order ]})\n",
+    "\n",
+    "# creating L1 (frequent itemsets)\n",
+    "supp_cnt_L = {}\n",
+    "f, sup, new_discarded = get_frequent(C[itemset_size], Transactions, min_support, Discarded)\n",
+    "Discarded.update({ itemset_size: new_discarded}) # discarded itemsets from the first iteration\n",
+    "L.update({itemset_size : f})\n",
+    "supp_cnt_L.update({itemset_size : sup})\n",
+    "\n",
+    "result = (L[1], supp_cnt_L[1])\n",
+    "print(\"L1 \\n\")\n",
+    "print(result)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "50c396b0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Result C2 \n",
+      "\n",
+      "([['I1', 'I2'], ['I1', 'I3'], ['I1', 'I4'], ['I1', 'I5'], ['I2', 'I3'], ['I2', 'I4'], ['I2', 'I5'], ['I3', 'I4'], ['I3', 'I5'], ['I4', 'I5']], [4, 4, 1, 2, 4, 2, 2, 0, 1, 0])\n",
+      "\n",
+      "Result L2 \n",
+      "\n",
+      "([['I1', 'I2'], ['I1', 'I3'], ['I1', 'I5'], ['I2', 'I3'], ['I2', 'I4'], ['I2', 'I5']], [4, 4, 2, 4, 2, 2])\n",
+      "\n",
+      "Result C3 \n",
+      "\n",
+      "([['I1', 'I2', 'I3'], ['I1', 'I2', 'I5'], ['I1', 'I3', 'I5'], ['I2', 'I3', 'I4'], ['I2', 'I3', 'I5'], ['I2', 'I4', 'I5']], [2, 2, 1, 0, 1, 0])\n",
+      "\n",
+      "Result L3 \n",
+      "\n",
+      "([['I1', 'I2', 'I3'], ['I1', 'I2', 'I5']], [2, 2])\n",
+      "\n",
+      "Result C4 \n",
+      "\n",
+      "([['I1', 'I2', 'I3', 'I5']], [1])\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "k = itemset_size + 1\n",
+    "convergence = False\n",
+    "while not convergence:\n",
+    "    C.update({k : join_set_itemsets(L[k-1], order)})\n",
+    "    print(\"Result C{} \\n\".format(k))\n",
\\n\".format(k))\n", + " result = C[k], [count_occurences(it, Transactions) for it in C[k]]\n", + " print (result)\n", + " print()\n", + " f, sup, new_discarded = get_frequent(C[k],Transactions, min_support, Discarded)\n", + " Discarded.update({k : new_discarded})\n", + " L.update({k : f})\n", + " supp_cnt_L.update({k : sup})\n", + " if len(L[k]) == 0:\n", + " convergence = True\n", + " else:\n", + " print(\"Result L{} \\n\".format(k))\n", + " result = L[k], supp_cnt_L[k]\n", + " print(result)\n", + " print()\n", + " k += 1" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "101df4e0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Freq. Itemset: {'I2', 'I1'}\n", + " Rule: ['I2'] -> ['I1'] \n", + " conf: 0.571 supp: 0.444 lift: 0.857 Freq. Itemset: {'I2', 'I1'}\n", + " Rule: ['I1'] -> ['I2'] \n", + " conf: 0.667 supp: 0.444 lift: 0.857 Freq. Itemset: {'I3', 'I1'}\n", + " Rule: ['I3'] -> ['I1'] \n", + " conf: 0.667 supp: 0.444 lift: 1.000 Freq. Itemset: {'I3', 'I1'}\n", + " Rule: ['I1'] -> ['I3'] \n", + " conf: 0.667 supp: 0.444 lift: 1.000 Freq. Itemset: {'I5', 'I1'}\n", + " Rule: ['I5'] -> ['I1'] \n", + " conf: 1.000 supp: 0.222 lift: 1.500 Freq. Itemset: {'I5', 'I1'}\n", + " Rule: ['I1'] -> ['I5'] \n", + " conf: 0.333 supp: 0.222 lift: 1.500 Freq. Itemset: {'I3', 'I2'}\n", + " Rule: ['I3'] -> ['I2'] \n", + " conf: 0.667 supp: 0.444 lift: 0.857 Freq. Itemset: {'I3', 'I2'}\n", + " Rule: ['I2'] -> ['I3'] \n", + " conf: 0.571 supp: 0.444 lift: 0.857 Freq. Itemset: {'I4', 'I2'}\n", + " Rule: ['I4'] -> ['I2'] \n", + " conf: 1.000 supp: 0.222 lift: 1.286 Freq. Itemset: {'I5', 'I2'}\n", + " Rule: ['I5'] -> ['I2'] \n", + " conf: 1.000 supp: 0.222 lift: 1.286 Freq. Itemset: {'I3', 'I2', 'I1'}\n", + " Rule: ['I3'] -> ['I2', 'I1'] \n", + " conf: 0.333 supp: 0.222 lift: 0.750 Freq. Itemset: {'I3', 'I2', 'I1'}\n", + " Rule: ['I1'] -> ['I3', 'I2'] \n", + " conf: 0.333 supp: 0.222 lift: 0.750 Freq. Itemset: {'I3', 'I2', 'I1'}\n", + " Rule: ['I3', 'I2'] -> ['I1'] \n", + " conf: 0.500 supp: 0.222 lift: 0.750 Freq. Itemset: {'I3', 'I2', 'I1'}\n", + " Rule: ['I3', 'I1'] -> ['I2'] \n", + " conf: 0.500 supp: 0.222 lift: 0.643 Freq. Itemset: {'I3', 'I2', 'I1'}\n", + " Rule: ['I2', 'I1'] -> ['I3'] \n", + " conf: 0.500 supp: 0.222 lift: 0.750 Freq. Itemset: {'I5', 'I2', 'I1'}\n", + " Rule: ['I5'] -> ['I2', 'I1'] \n", + " conf: 1.000 supp: 0.222 lift: 2.250 Freq. Itemset: {'I5', 'I2', 'I1'}\n", + " Rule: ['I1'] -> ['I5', 'I2'] \n", + " conf: 0.333 supp: 0.222 lift: 1.500 Freq. Itemset: {'I5', 'I2', 'I1'}\n", + " Rule: ['I5', 'I2'] -> ['I1'] \n", + " conf: 1.000 supp: 0.222 lift: 1.500 Freq. Itemset: {'I5', 'I2', 'I1'}\n", + " Rule: ['I5', 'I1'] -> ['I2'] \n", + " conf: 1.000 supp: 0.222 lift: 1.286 Freq. 
+      " Rule: ['I2', 'I1'] -> ['I5'] \n",
+      " conf: 0.500 supp: 0.222 lift: 2.250 \n"
+     ]
+    }
+   ],
+   "source": [
+    "assoc_rules_str = \"\"\n",
+    "\n",
+    "for i in range(1, len(L)):\n",
+    "    for j in range(len(L[i])):\n",
+    "        s = powerset(set(L[i][j]))\n",
+    "        s.pop() # drop the subset containing all the elements (the itemset itself)\n",
+    "        for z in s:\n",
+    "            S = set(z)\n",
+    "            X = set(L[i][j])\n",
+    "            X_S = set(X-S)\n",
+    "            sup_x = count_occurences(X, Transactions)\n",
+    "            sup_x_s = count_occurences(X_S, Transactions)\n",
+    "            conf = sup_x / count_occurences(S, Transactions)\n",
+    "            lift = conf / (sup_x_s/num_trans)\n",
+    "            if conf >= min_confidence and sup_x/num_trans >= min_support:\n",
+    "                assoc_rules_str += write_rules(X, X_S, S, conf, sup_x, lift, num_trans)\n",
+    "\n",
+    "print(assoc_rules_str)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/apriori.py b/apriori.py
new file mode 100644
index 0000000..42f48a7
--- /dev/null
+++ b/apriori.py
@@ -0,0 +1,79 @@
+import numpy as np
+from utils import *
+from functions import load_transaction_data, get_frequent, join_set_itemsets, count_occurences, write_rules, powerset
+
+
+path_to_data = 'Data.txt'
+min_support = 2/9
+min_confidence = 0.3 # 30 percent
+order = ['I' + str(i) for i in range(1,6)]
+#print(order)
+
+with open(path_to_data, "r") as file:
+    data = file.read()
+transactions = data.split("\n")
+num_trans = len(transactions)
+#print(num_trans)
+
+Transactions = load_transaction_data(path_to_data, order)
+
+# initialization
+# for itemsets of size 1
+C = {}
+L = {}
+itemset_size = 1
+Discarded = {itemset_size : []}
+C.update({itemset_size : [ [f] for f in order ]})
+
+# creating L1 (frequent itemsets)
+supp_cnt_L = {}
+f, sup, new_discarded = get_frequent(C[itemset_size], Transactions, min_support, Discarded)
+Discarded.update({ itemset_size: new_discarded}) # discarded itemsets from the first iteration
+L.update({itemset_size : f})
+supp_cnt_L.update({itemset_size : sup})
+
+result = (L[1], supp_cnt_L[1])
+
+k = itemset_size + 1
+convergence = False
+while not convergence:
+    C.update({k : join_set_itemsets(L[k-1], order)})
+    print("Result C{} \n".format(k))
+    result = C[k], [count_occurences(it, Transactions) for it in C[k]]
+    print(result)
+    print()
+    f, sup, new_discarded = get_frequent(C[k], Transactions, min_support, Discarded)
+    Discarded.update({k : new_discarded})
+    L.update({k : f})
+    supp_cnt_L.update({k : sup})
+    if len(L[k]) == 0:
+        convergence = True
+    else:
+        print("Table L{} \n".format(k))
+        result = L[k], supp_cnt_L[k]
+        print(result)
+        print()
+    k += 1
+
+# Generating association rules from the frequent itemsets found above
+
+# rules start from frequent itemsets of size 2, since size-1 itemsets cannot produce rules
+
+assoc_rules_str = ""
+
+for i in range(1, len(L)):
+    for j in range(len(L[i])):
+        s = powerset(set(L[i][j]))
+        s.pop() # drop the subset containing all the elements (the itemset itself)
+        for z in s:
+            S = set(z)
+            X = set(L[i][j])
+            X_S = set(X-S)
+            sup_x = count_occurences(X, Transactions)
+            sup_x_s = count_occurences(X_S, Transactions)
+            conf = sup_x / count_occurences(S, Transactions)
+            lift = conf / (sup_x_s/num_trans)
+            if conf >= min_confidence and sup_x/num_trans >= min_support:
+                assoc_rules_str += write_rules(X, X_S, S, conf, sup_x, lift, num_trans)
+
+print(assoc_rules_str)
diff --git a/functions.py b/functions.py
new file mode 100644
index 0000000..4bb8a35
--- /dev/null
+++ b/functions.py
@@ -0,0 +1,84 @@
+import numpy as np
+from itertools import combinations, chain
+
+def count_occurences(itemset, transaction):
+    count = 0
+    for i in range(len(transaction)):
+        if set(itemset).issubset(set(transaction[i])):
+            count += 1
+    return count
+
+def join_two_itemsets(it1, it2, order):
+    it1.sort(key=lambda x: order.index(x))
+    it2.sort(key=lambda x: order.index(x))
+
+    for i in range(len(it1)-1):
+        if it1[i] != it2[i]:
+            return
+    if order.index(it1[-1]) < order.index(it2[-1]):
+        return it1 + [it2[-1]]
+    return []
+
+def join_set_itemsets(set_of_its, order):
+    C = []
+    for i in range(len(set_of_its)):
+        for j in range(i+1, len(set_of_its)):
+            it_out = join_two_itemsets(set_of_its[i], set_of_its[j], order)
+            if it_out is not None and len(it_out) > 0:
+                C.append(it_out)
+    return C
+
+
+def load_transaction_data(path_of_data, order):
+    transaction = []
+    with open(path_of_data, 'r') as fid:
+        for lines in fid:
+            str_line = list(lines.strip().split(','))
+            # regardless of how many times an item appears, it is treated as one occurrence per transaction
+            _t = list(np.unique(str_line))
+            # sorting keeps each transaction in the defined item order
+            _t.sort(key=lambda x: order.index(x))
+            transaction.append(_t)
+    return transaction
+
+
+def get_frequent(itemsets, transaction, min_support, prev_discarded):
+    L = [] # list of frequent itemsets
+    supp_count = [] # support count of each itemset
+    new_discarded = []
+
+
+    k = len(prev_discarded.keys())
+    for s in range(len(itemsets)):
+        # skip candidates that contain a previously discarded itemset before counting their frequency
+        discarded_before = False
+        if k > 0:
+            # a candidate cannot be frequent if one of its subsets was discarded
+            for it in prev_discarded[k]:
+                if set(it).issubset(set(itemsets[s])):
+                    discarded_before = True
+                    break
+        if not discarded_before:
+            count = count_occurences(itemsets[s], transaction)
+            if count/len(transaction) >= min_support:
+                L.append(itemsets[s])
+                supp_count.append(count)
+            else:
+                new_discarded.append(itemsets[s])
+    return L, supp_count, new_discarded
+
+
+# Generating association rules from the frequent itemsets above
+
+# rules start from frequent itemsets of size 2, since size-1 itemsets cannot produce rules
+def powerset(s):
+    return list(chain.from_iterable(combinations(s, r) for r in range(1, len(s) + 1)))
+
+def write_rules(X, X_S, S, conf, supp, lift, num_trans):
+    out_rules = ""
+    out_rules += "Freq. Itemset: {}\n".format(X)
+    out_rules += " Rule: {} -> {} \n".format(list(S), list(X_S))
+    out_rules += " conf: {0:2.3f} ".format(conf)
+    out_rules += " supp: {0:2.3f} ".format(supp / num_trans)
+    out_rules += " lift: {0:2.3f} ".format(lift)
+    return out_rules
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index f13a33d..10a8b19 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,4 @@
 panda
-notebook
\ No newline at end of file
+notebook
+numpy==1.26.2
+utils==1.0.1