Implement apriori #12

Open · wants to merge 6 commits into main
9 changes: 9 additions & 0 deletions Data.txt
@@ -0,0 +1,9 @@
I1,I2,I5
I2,I4
I2,I3
I1,I2,I4
I1,I3
I2,I3
I1,I3
I1,I2,I3,I5
I1,I2,I3
2 changes: 2 additions & 0 deletions TEST.md
@@ -0,0 +1,2 @@
This is for testing purposes only
Testing the commit signing key
238 changes: 238 additions & 0 deletions apriori.ipynb
@@ -0,0 +1,238 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "initial_id",
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"from utils import *\n",
"from functions import load_transaction_data, get_frequent,join_set_itemsets,count_occurences,write_rules,powerset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "38d9e47d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9\n"
]
}
],
"source": [
"path_to_data = 'Data.txt'\n",
"min_support = 2/9\n",
"min_confidence = 0.3 # 30 percent\n",
"order = ['I' + str(i) for i in range(1,6)]\n",
"#print(order)\n",
"\n",
"with open(path_to_data, \"r\") as file:\n",
" data = file.read()\n",
"transactions = data.split(\"\\n\")\n",
"num_trans = len(transactions)\n",
"\n",
"print(num_trans)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "759093e0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"L1 \n",
"\n",
"([['I1'], ['I2'], ['I3'], ['I4'], ['I5']], [6, 7, 6, 2, 2])\n"
]
}
],
"source": [
"Transactions = load_transaction_data(path_to_data, order)\n",
"\n",
"#innitialization\n",
"#for itemsets of 1\n",
"C = {}\n",
"L = {}\n",
"itemset_size = 1\n",
"Discarded = {itemset_size : []}\n",
"C.update({itemset_size : [ [f] for f in order ]})\n",
"\n",
"#creating L1 (Frequent itemsets)\n",
"supp_cnt_L = {}\n",
"f, sup, new_discarded = get_frequent(C[itemset_size], Transactions, min_support, Discarded)\n",
"Discarded.update({ itemset_size: new_discarded}) #updates to discarded itemsets from first iteration\n",
"L.update({itemset_size : f})\n",
"supp_cnt_L.update({itemset_size : sup})\n",
"\n",
"result = (L[1], supp_cnt_L[1])\n",
"print(\"L1 \\n\")\n",
"print(result)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "50c396b0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Result C2 \n",
"\n",
"([['I1', 'I2'], ['I1', 'I3'], ['I1', 'I4'], ['I1', 'I5'], ['I2', 'I3'], ['I2', 'I4'], ['I2', 'I5'], ['I3', 'I4'], ['I3', 'I5'], ['I4', 'I5']], [4, 4, 1, 2, 4, 2, 2, 0, 1, 0])\n",
"\n",
"Result L2 \n",
"\n",
"([['I1', 'I2'], ['I1', 'I3'], ['I1', 'I5'], ['I2', 'I3'], ['I2', 'I4'], ['I2', 'I5']], [4, 4, 2, 4, 2, 2])\n",
"\n",
"Result C3 \n",
"\n",
"([['I1', 'I2', 'I3'], ['I1', 'I2', 'I5'], ['I1', 'I3', 'I5'], ['I2', 'I3', 'I4'], ['I2', 'I3', 'I5'], ['I2', 'I4', 'I5']], [2, 2, 1, 0, 1, 0])\n",
"\n",
"Result L3 \n",
"\n",
"([['I1', 'I2', 'I3'], ['I1', 'I2', 'I5']], [2, 2])\n",
"\n",
"Result C4 \n",
"\n",
"([['I1', 'I2', 'I3', 'I5']], [1])\n",
"\n"
]
}
],
"source": [
"k = itemset_size + 1\n",
"convergence = False\n",
"while not convergence:\n",
" C.update({k:join_set_itemsets(L[k-1], order)})\n",
" print(\"Result C{} \\n\".format(k))\n",
" result = C[k], [count_occurences(it, Transactions) for it in C[k]]\n",
" print (result)\n",
" print()\n",
" f, sup, new_discarded = get_frequent(C[k],Transactions, min_support, Discarded)\n",
" Discarded.update({k : new_discarded})\n",
" L.update({k : f})\n",
" supp_cnt_L.update({k : sup})\n",
" if len(L[k]) == 0:\n",
" convergence = True\n",
" else:\n",
" print(\"Result L{} \\n\".format(k))\n",
" result = L[k], supp_cnt_L[k]\n",
" print(result)\n",
" print()\n",
" k += 1"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "101df4e0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Freq. Itemset: {'I2', 'I1'}\n",
" Rule: ['I2'] -> ['I1'] \n",
" conf: 0.571 supp: 0.444 lift: 0.857 Freq. Itemset: {'I2', 'I1'}\n",
" Rule: ['I1'] -> ['I2'] \n",
" conf: 0.667 supp: 0.444 lift: 0.857 Freq. Itemset: {'I3', 'I1'}\n",
" Rule: ['I3'] -> ['I1'] \n",
" conf: 0.667 supp: 0.444 lift: 1.000 Freq. Itemset: {'I3', 'I1'}\n",
" Rule: ['I1'] -> ['I3'] \n",
" conf: 0.667 supp: 0.444 lift: 1.000 Freq. Itemset: {'I5', 'I1'}\n",
" Rule: ['I5'] -> ['I1'] \n",
" conf: 1.000 supp: 0.222 lift: 1.500 Freq. Itemset: {'I5', 'I1'}\n",
" Rule: ['I1'] -> ['I5'] \n",
" conf: 0.333 supp: 0.222 lift: 1.500 Freq. Itemset: {'I3', 'I2'}\n",
" Rule: ['I3'] -> ['I2'] \n",
" conf: 0.667 supp: 0.444 lift: 0.857 Freq. Itemset: {'I3', 'I2'}\n",
" Rule: ['I2'] -> ['I3'] \n",
" conf: 0.571 supp: 0.444 lift: 0.857 Freq. Itemset: {'I4', 'I2'}\n",
" Rule: ['I4'] -> ['I2'] \n",
" conf: 1.000 supp: 0.222 lift: 1.286 Freq. Itemset: {'I5', 'I2'}\n",
" Rule: ['I5'] -> ['I2'] \n",
" conf: 1.000 supp: 0.222 lift: 1.286 Freq. Itemset: {'I3', 'I2', 'I1'}\n",
" Rule: ['I3'] -> ['I2', 'I1'] \n",
" conf: 0.333 supp: 0.222 lift: 0.750 Freq. Itemset: {'I3', 'I2', 'I1'}\n",
" Rule: ['I1'] -> ['I3', 'I2'] \n",
" conf: 0.333 supp: 0.222 lift: 0.750 Freq. Itemset: {'I3', 'I2', 'I1'}\n",
" Rule: ['I3', 'I2'] -> ['I1'] \n",
" conf: 0.500 supp: 0.222 lift: 0.750 Freq. Itemset: {'I3', 'I2', 'I1'}\n",
" Rule: ['I3', 'I1'] -> ['I2'] \n",
" conf: 0.500 supp: 0.222 lift: 0.643 Freq. Itemset: {'I3', 'I2', 'I1'}\n",
" Rule: ['I2', 'I1'] -> ['I3'] \n",
" conf: 0.500 supp: 0.222 lift: 0.750 Freq. Itemset: {'I5', 'I2', 'I1'}\n",
" Rule: ['I5'] -> ['I2', 'I1'] \n",
" conf: 1.000 supp: 0.222 lift: 2.250 Freq. Itemset: {'I5', 'I2', 'I1'}\n",
" Rule: ['I1'] -> ['I5', 'I2'] \n",
" conf: 0.333 supp: 0.222 lift: 1.500 Freq. Itemset: {'I5', 'I2', 'I1'}\n",
" Rule: ['I5', 'I2'] -> ['I1'] \n",
" conf: 1.000 supp: 0.222 lift: 1.500 Freq. Itemset: {'I5', 'I2', 'I1'}\n",
" Rule: ['I5', 'I1'] -> ['I2'] \n",
" conf: 1.000 supp: 0.222 lift: 1.286 Freq. Itemset: {'I5', 'I2', 'I1'}\n",
" Rule: ['I2', 'I1'] -> ['I5'] \n",
" conf: 0.500 supp: 0.222 lift: 2.250 \n"
]
}
],
"source": [
"assoc_rules_str = \"\"\n",
"\n",
"for i in range(1, len(L)):\n",
" for j in range (len(L[i])):\n",
" s = powerset(set(L[i][j]))\n",
" s.pop() #subset containing all the elements will be gotten rid of\n",
" for z in s:\n",
" S = set(z)\n",
" X = set(L[i][j])\n",
" X_S = set(X-S)\n",
" sup_x = count_occurences(X, Transactions)\n",
" sup_x_s = count_occurences(X_S, Transactions)\n",
" conf = sup_x / count_occurences(S, Transactions)\n",
" lift = conf / (sup_x_s/num_trans)\n",
" if conf >= min_confidence and sup_x >= min_support:\n",
" assoc_rules_str += write_rules(X, X_S, S, conf, sup_x, lift, num_trans)\n",
"\n",
"print(assoc_rules_str)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
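Note: the helpers imported from functions (load_transaction_data, get_frequent, join_set_itemsets, count_occurences, write_rules, powerset) are not included in this diff. The sketch below is a hypothetical reconstruction, inferred only from how the notebook and apriori.py call them and from the printed outputs above; the real implementations in functions.py may differ.

from itertools import chain, combinations

def count_occurences(itemset, Transactions):
    # Number of transactions that contain every item of `itemset`.
    target = set(itemset)
    return sum(1 for t in Transactions if target.issubset(t))

def powerset(s):
    # Non-empty subsets of `s`, smallest first, so the full set comes last
    # (the rule-generation loop pops it off before splitting into S -> X-S).
    items = list(s)
    return list(chain.from_iterable(combinations(items, r) for r in range(1, len(items) + 1)))

def join_set_itemsets(prev_frequent, order):
    # Classic apriori join: two (k-1)-itemsets are merged into a k-itemset only
    # when they agree on everything except their last item; items stay in the
    # canonical `order`, so each candidate is generated once.
    candidates = []
    for i in range(len(prev_frequent)):
        for j in range(i + 1, len(prev_frequent)):
            a, b = prev_frequent[i], prev_frequent[j]
            if a[:-1] == b[:-1]:
                candidates.append(sorted(set(a) | set(b), key=order.index))
    return candidates

def get_frequent(candidates, Transactions, min_support, prev_discarded):
    # Split candidates into frequent itemsets (with their support counts) and
    # the ones whose support fraction falls below min_support.
    frequent, supports, discarded = [], [], []
    num_trans = len(Transactions)
    for c in candidates:
        count = count_occurences(c, Transactions)
        if count / num_trans >= min_support:
            frequent.append(c)
            supports.append(count)
        else:
            discarded.append(c)
    return frequent, supports, discarded

The prev_discarded argument is accepted but unused in this sketch; a fuller version could use it to prune candidates that contain an already-discarded subset, which is the usual apriori optimization.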
79 changes: 79 additions & 0 deletions apriori.py
@@ -0,0 +1,79 @@
import numpy as np
from utils import *
from functions import load_transaction_data, get_frequent, join_set_itemsets, count_occurences, write_rules, powerset


path_to_data = 'Data.txt'
min_support = 2/9
min_confidence = 0.3 # 30 percent
order = ['I' + str(i) for i in range(1,6)]
#print(order)

with open(path_to_data, "r") as file:
data = file.read()
transactions = data.split("\n")
num_trans = len(transactions)
#print(num_trans)

Transactions = load_transaction_data(path_to_data, order)

# initialization
# for itemsets of size 1
C = {}
L = {}
itemset_size = 1
Discarded = {itemset_size : []}
C.update({itemset_size : [ [f] for f in order ]})

#creating L1 (Frequent itemsets)
supp_cnt_L = {}
f, sup, new_discarded = get_frequent(C[itemset_size], Transactions, min_support, Discarded)
Discarded.update({itemset_size: new_discarded}) # record itemsets discarded in the first iteration
L.update({itemset_size : f})
supp_cnt_L.update({itemset_size : sup})

result = (L[1], supp_cnt_L[1])

k = itemset_size + 1
convergence = False
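# Grow itemsets one size at a time: join L[k-1] into candidates C[k], count
# their support, keep the frequent ones as L[k], and stop once L[k] is empty.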
while not convergence:
C.update({k:join_set_itemsets(L[k-1], order)})
print("Result C{} \n".format(k))
    result = C[k], [count_occurences(it, Transactions) for it in C[k]]
    print(result)
print()
f, sup, new_discarded = get_frequent(C[k],Transactions, min_support, Discarded)
Discarded.update({k : new_discarded})
L.update({k : f})
supp_cnt_L.update({k : sup})
if len(L[k]) == 0:
convergence = True
else:
print("Table L{} \n".format(k))
result = L[k], supp_cnt_L[k]
print(result)
print()
k += 1

# Generate association rules from the frequent itemsets found above.
# Only itemsets of size 2 or more can yield rules: for a single-item set,
# popping the full set from its powerset leaves no subsets to split into S -> X-S.
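# For a frequent itemset X and each non-empty proper subset S:
#   confidence(S -> X-S) = support_count(X) / support_count(S)
#   lift(S -> X-S)       = confidence(S -> X-S) / (support_count(X-S) / num_trans)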

assoc_rules_str = ""

for i in range(1, len(L)):
for j in range (len(L[i])):
s = powerset(set(L[i][j]))
        s.pop() # drop the last subset, which is the full itemset itself
for z in s:
S = set(z)
X = set(L[i][j])
X_S = set(X-S)
sup_x = count_occurences(X, Transactions)
sup_x_s = count_occurences(X_S, Transactions)
conf = sup_x / count_occurences(S, Transactions)
lift = conf / (sup_x_s/num_trans)
            if conf >= min_confidence and sup_x / num_trans >= min_support:
assoc_rules_str += write_rules(X, X_S, S, conf, sup_x, lift, num_trans)

print(assoc_rules_str)
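As a quick sanity check, one of the rules in the notebook output can be recomputed by hand from Data.txt; the quantities below mirror the variables used in the loop above.

# Hand-check of the rule {I5} -> {I1, I2}:
#   support_count({I1, I2, I5}) = 2   (transactions 1 and 8)
#   support_count({I5})         = 2   -> conf = 2 / 2 = 1.000
#   support_count({I1, I2})     = 4   -> lift = 1.000 / (4 / 9) = 2.250
#   supp = 2 / 9 = 0.222
# which matches the "conf: 1.000 supp: 0.222 lift: 2.250" line printed for this rule.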