Assignment 5/packages/run_apriori.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

import numpy as np
from subprocess import call
import re
import os

'''
 run_apriori.py
 version: 0.2
 last change: 26/11/2014, by Wout Megchelenbrink
 fixed lots of issues
'''

'''
 Adapted by Camil Staps 2015/12/12
 
  * Make it a function
  * Allow for calling it from another directory
'''

def _apriori_executable(platform_name):
    """Return the path of the apriori binary that sits next to this module."""
    if platform_name.startswith('linux'):
        binary = 'apriori'
    elif platform_name == 'darwin':
        binary = 'aprioriMAC'
    elif platform_name == 'win32':
        binary = 'apriori.exe'
    else:
        raise RuntimeError(
            'No apriori executable is available for platform {0!r}'.format(platform_name))
    return os.path.join(os.path.dirname(os.path.realpath(__file__)), binary)


def _invoke_apriori(arguments):
    """Run apriori with an argument list; return its exit status (None if it could not start)."""
    try:
        # An argument list (no shell) keeps paths with spaces intact and
        # prevents the input file name from being interpreted by a shell.
        return call(arguments)
    except OSError as error:
        print('Could not start {0}: {1}'.format(arguments[0], error))
        return None


def _read_scored_lines(path, label):
    """Read an apriori output file and return (value, line) pairs for the given annotation label."""
    # Match the number inside the trailing "[<label> <number>" annotation, so
    # digits that appear in item names are never mistaken for the score.
    number = re.compile(
        r'\[' + re.escape(label) + r'\s*((?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)')
    scored = []
    with open(path, 'r') as handle:
        for raw in handle:
            text = raw.rstrip('\r\n')
            if not text.strip():
                continue
            found = number.findall(text)
            if not found:
                raise ValueError(
                    'No "{0}" value found in apriori output line: {1!r}'.format(label, text))
            # The annotation is appended after the items, so take the last match.
            scored.append((float(found[-1]), text))
    return scored


def _sorted_descending(scored):
    """Return the line texts ordered from the highest to the lowest value."""
    return [text for _, text in sorted(scored, key=lambda pair: pair[0], reverse=True)]


def _remove_if_present(path):
    """Delete a temporary file, ignoring the case where it cannot be removed."""
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup: the file may never have been created.
        pass


def run_apriori(filename, minSup, minConf, maxRule):
    """Run the bundled apriori executable and return its sorted results.

    The executable is looked up in the directory of this module, so the
    function can be called from any working directory. Two temporary files,
    ``apriori_temp1.txt`` and ``apriori_temp2.txt``, are written to the
    current working directory and removed again before returning.

    Parameters:
        filename: path of the transaction file handed to apriori. It is
            passed as a single argument, so it may contain spaces and must
            not carry extra quoting.
        minSup: value passed to apriori's ``-s`` option.
        minConf: value passed to apriori's ``-c`` option. Rules are only
            mined when this is greater than zero.
        maxRule: value passed to apriori's ``-n`` option for rule mining.

    Returns:
        A tuple ``(FrequentItemsetsSorted, AssocRulesSorted)`` of lists of
        output lines. Item sets are ordered by descending ``Sup.`` value and
        rules by descending ``Conf.`` value. The rule list is empty when
        ``minConf`` is not greater than zero.

    Raises:
        SystemExit: when apriori cannot be started or reports a non-zero
            exit status (after printing an error message).
        RuntimeError: when no executable exists for the current platform.
        ValueError: when an output line carries no parsable annotation.
    """
    import sys

    executable = _apriori_executable(sys.platform)
    itemset_file = 'apriori_temp1.txt'
    rule_file = 'apriori_temp2.txt'
    want_rules = minConf > 0

    try:
        # Run Apriori Algorithm
        print('Mining for frequent itemsets by the Apriori algorithm')
        status1 = _invoke_apriori([
            executable,
            '-s{0}'.format(minSup),
            '-v[Sup. %3S]',
            filename,
            itemset_file,
        ])
        if status1 != 0:
            print('An error occurred while calling apriori, a likely cause is that minSup was set too high such that no frequent itemsets were generated or that the apriori executable could not be run.')
            sys.exit()

        if want_rules:
            print('Mining for associations by the Apriori algorithm')
            status2 = _invoke_apriori([
                executable,
                '-tr',
                '-f,',
                '-o',
                '-n{0}'.format(maxRule),
                '-c{0}'.format(minConf),
                '-s{0}'.format(minSup),
                '-v[Conf. %3C,Sup. %3S]',
                filename,
                rule_file,
            ])
            if status2 != 0:
                print('An error occured while calling apriori')
                sys.exit()

        print('Apriori analysis done, extracting results')
        itemsets = _read_scored_lines(itemset_file, 'Sup.')
        # Without a positive minConf no rule file was produced, so there is
        # nothing to read and the rule list is empty.
        rules = _read_scored_lines(rule_file, 'Conf.') if want_rules else []
    finally:
        # Remove the temporary files even when parsing fails.
        _remove_if_present(itemset_file)
        if want_rules:
            _remove_if_present(rule_file)

    # sort (FrequentItemsets by support value, AssocRules by confidence value)
    return _sorted_descending(itemsets), _sorted_descending(rules)
    
def apriori_print(FrequentItemsetsSorted, AssocRulesSorted):
    """Print a plain-text report of frequent item sets and association rules.

    Parameters:
        FrequentItemsetsSorted: iterable of item-set descriptions; each one
            is printed on its own line prefixed with ``Item: ``.
        AssocRulesSorted: iterable of rule descriptions; each one is printed
            on its own line prefixed with ``Rule: ``.

    Returns:
        None. The report is written to standard output.
    """
    from time import sleep

    # Half-second pause before any output; presumably it lets output of a
    # preceding external process appear first -- TODO confirm.
    sleep(.5)

    report = ['\n', 'RESULTS:\n', 'Frequent itemsets:']
    report.extend('Item: {0}'.format(itemset) for itemset in FrequentItemsetsSorted)
    report.append('\n')
    report.append('Association rules:')
    report.extend('Rule: {0}'.format(rule) for rule in AssocRulesSorted)

    # One print call per entry, so every entry is followed by a newline.
    for entry in report:
        print(entry)