# useless.py

# -*- coding: utf-8 -*-
# """IR_Assignment1.ipynb

# Automatically generated by Colaboratory.

# Original file is located at
#     https://colab.research.google.com/drive/1ByabD3y9BBb3Pwz8rA5wqrdYx2FthNsZ
# """

# from google.colab import drive
# drive.mount('/content/drive')

#importing the necessary libraries
# import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer, PorterStemmer
# from nltk.tokenize import sent_tokenize , word_tokenize
# import glob
# import re
# import os
# import numpy as np
# import re
# import sys
# Stopwords = set(stopwords.words('english'))

def finding_all_unique_words_and_freq(words):
    """
    Count how often each distinct word occurs in a tokenised document.

    Args:

    `words`: iterable of hashable words processed from the document

    Returns:

    A dict mapping every distinct word to its number of occurrences. Keys
    appear in order of first occurrence; empty input gives an empty dict.
    """
    word_freq = {}
    # One pass over the tokens: testing membership in a list and then calling
    # `words.count` for every unique word is quadratic in the document length.
    for word in words:
        word_freq[word] = word_freq.get(word, 0) + 1
    return word_freq

def finding_freq_of_word_in_doc(word,words):
    """
    Count the frequency of a word in the document.

    Args:
    `word`: word to look for

    `words`: words of the document; any object with a `count` method
    (for example a list of tokens)

    Returns:
    The number of times `word` occurs in `words`.
    """
    # The count used to be stored in a local and dropped, so callers got None.
    return words.count(word)
        
def remove_special_characters(text):
    """
    Remove the special characters, if any, from the document text so that it
    is easier to process.

    Every character that is not an ASCII letter, a digit or whitespace is
    deleted; nothing is substituted for it.

    Args:

    `text`: the raw text

    Return:

    The text after removing the special characters.
    """
    # Raw string: in a plain literal '\s' is an invalid string escape, which
    # recent Python versions warn about. The pattern itself is unchanged.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

# Glob pattern selecting the corpus documents. Both indexing passes default to
# it so that they enumerate the same files and therefore agree on document ids.
CORPUS_GLOB = '/content/drive/MyDrive/IR_ASSIGNMENT/IR_CORPUS/*.txt'


def _document_terms(path):
    """
    Read one corpus file and return its list of normalised terms.

    Special characters and digits are stripped, the text is split with
    `word_tokenize`, tokens are lower-cased, and single-character tokens and
    stop words (members of `Stopwords`) are dropped.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, "r") as handle:
        text = handle.read()
    text = remove_special_characters(text)
    text = re.sub(r'\d', '', text)
    lowered = (token.lower() for token in word_tokenize(text))
    # The length test applies to each token. Testing the length of the whole
    # token list instead would keep every single-character token.
    return [term for term in lowered if len(term) > 1 and term not in Stopwords]


def fileOpeningProcessing(corpus_glob=CORPUS_GLOB):
    """
    Opens every corpus file, preprocesses it (removing special characters,
    digits and stop words) and collects the vocabulary.

    The results are published as the module-level names `unique_words_all`
    and `files_with_index`, which the indexing and query functions read.

    Args:
    `corpus_glob`: glob pattern of the documents to read

    Returns:
    A tuple `(unique_words_all, files_with_index)`: the set of all terms, and
    a dict mapping the 1-based document id to the file's base name.
    """
    global unique_words_all, files_with_index

    vocabulary = set()
    names_by_id = {}
    for idx, path in enumerate(glob.glob(corpus_glob), start=1):
        vocabulary.update(_document_terms(path))
        names_by_id[idx] = os.path.basename(path)

    unique_words_all = vocabulary
    files_with_index = names_by_id
    return unique_words_all, files_with_index


class Node:
    """
    One entry of a posting list: a document id, the frequency of the term in
    that document, and a link to the next entry.
    """
    def __init__(self ,docId, freq = None):
        self.freq = freq        # occurrences of the term in the document
        self.doc = docId        # document id, as assigned while indexing
        self.nextval = None     # following Node, or None at the end of the list


class SlinkedList:
    """
    Singly linked list identified by its first node, `head`.
    """
    def __init__(self ,head = None):
        self.head = head


def preprocessing(corpus_glob=CORPUS_GLOB):
    """
    Builds the posting lists (inverted index) of the IR Corpus.

    Requires `fileOpeningProcessing` to have run first, because the vocabulary
    is read from the module-level `unique_words_all`. The index is published
    as the module-level name `linked_list_data`.

    Each posting list starts with a placeholder head node; the real entries
    follow from `head.nextval`, one per document containing the term, in
    increasing document id.

    Args:
    `corpus_glob`: glob pattern of the documents to index; it should be the
    pattern given to `fileOpeningProcessing` so that document ids line up

    Returns:
    The dict mapping each term to its `SlinkedList`.
    """
    global linked_list_data

    postings = {}
    tails = {}  # last node of every list, so appending needs no traversal

    def _open_list(word):
        # Placeholder head: consumers start reading at head.nextval.
        placeholder = Node(None)
        postings[word] = SlinkedList(placeholder)
        tails[word] = placeholder

    for word in unique_words_all:
        _open_list(word)

    for idx, path in enumerate(glob.glob(corpus_glob), start=1):
        term_freq = finding_all_unique_words_and_freq(_document_terms(path))
        for word, freq in term_freq.items():
            if word not in tails:
                # Term missing from the published vocabulary (for instance a
                # different pattern was scanned): index it instead of failing.
                _open_list(word)
            entry = Node(idx, freq)
            tails[word].nextval = entry
            tails[word] = entry

    linked_list_data = postings
    return linked_list_data

# list_files = [f for f in glob.glob('/content/IR_CORPUS/*.txt')]
# print(list_files)

# """# Ignore"""

# from nltk.metrics.distance  import edit_distance
# from nltk.metrics.distance import jaccard_distance
# from nltk.util import ngrams
# query = input('Enter your query:')
# query = word_tokenize(query)
# connecting_words = []
# cnt = 1
# k = 0 #retrieve top k+1 words for edit distance
# different_words = []
# zeroes_and_ones = []
# edited_query_words = []
# zeroes_and_ones_of_all_words = []
# total_files = len(files_with_index)

# for word in query:
#     if word.lower() != "and" and word.lower() != "or" and word.lower() != "not":
#         different_words.append(word.lower())
#     else:
#         connecting_words.append(word.lower())
# skip_edit_distance = [0]*len(different_words)

# #if in unique words
# i = 0
# for query_word in different_words: 
#     # print(query_word)
#     if query_word in unique_words_all:
#         skip_edit_distance[i] = 1
#         zeroes_and_ones = [0] * total_files
#         linkedlist = linked_list_data[query_word].head
#         # print("word:",word)
#         while linkedlist.nextval is not None:
#             zeroes_and_ones[linkedlist.nextval.doc - 1] = 1
#             linkedlist = linkedlist.nextval
#         print("unique available :> zeroes_and_ones for ",query_word," :>",zeroes_and_ones)
#         zeroes_and_ones_of_all_words.append(zeroes_and_ones)
#     i+=1

# # if not in unique words
# for i,entry in enumerate(different_words):
#     # print(i,entry)
#     if not skip_edit_distance[i]:
#             temp = [(edit_distance(entry, w,substitution_cost = 2,transpositions = True),w) for w in unique_words_all]
#             all_words_sorted = sorted(temp)
#             # print("all_words_sorted :", all_words_sorted)
#             edited_query_words.append(all_words_sorted[0])


# print("edited_query_words after edit distance: ",edited_query_words)
# for entry in edited_query_words:
#             if entry[1] in unique_words_all:
#                 zeroes_and_ones = [0] * total_files
#                 linkedlist = linked_list_data[entry[1]].head
#                 # print(word)
#                 while linkedlist.nextval is not None:
#                     zeroes_and_ones[linkedlist.nextval.doc - 1] = 1
#                     linkedlist = linkedlist.nextval
#                 print("edit dist :> zeroes_and_ones for ",entry[1]," :>",zeroes_and_ones)
#                 zeroes_and_ones_of_all_words.append(zeroes_and_ones)
#             # else : 
#             #     print(word," not found")
#             #     sys.exit()


# # print("zeroes_and_ones_of_all_words: ",zeroes_and_ones_of_all_words)
# for word in connecting_words:
#     word_list1 = zeroes_and_ones_of_all_words[0]
#     word_list2 = zeroes_and_ones_of_all_words[1]
#     if word == "and":
#         bitwise_op = [w1 & w2 for (w1,w2) in zip(word_list1,word_list2)]
#         zeroes_and_ones_of_all_words.remove(word_list1)
#         zeroes_and_ones_of_all_words.remove(word_list2)
#         zeroes_and_ones_of_all_words.insert(0, bitwise_op);
#     elif word == "or":
#         bitwise_op = [w1 | w2 for (w1,w2) in zip(word_list1,word_list2)]
#         zeroes_and_ones_of_all_words.remove(word_list1)
#         zeroes_and_ones_of_all_words.remove(word_list2)
#         zeroes_and_ones_of_all_words.insert(0, bitwise_op);
#     elif word == "not":
#         bitwise_op = [not w1 for w1 in word_list2]
#         bitwise_op = [int(b == True) for b in bitwise_op]
#         zeroes_and_ones_of_all_words.remove(word_list2)
#         zeroes_and_ones_of_all_words.remove(word_list1)
#         bitwise_op = [w1 & w2 for (w1,w2) in zip(word_list1,bitwise_op)]
#         zeroes_and_ones_of_all_words.insert(0, bitwise_op);


# files = []    
# lis = zeroes_and_ones_of_all_words[0]
# print("final zeros after binary operation:",lis)
# cnt = 1
# for index in lis:
#     if index == 1:
#         files.append(files_with_index[cnt])
#     cnt = cnt+1
    
# print(files)

# """#To be used"""

# list_test = []
# for word in unique_words_all:
#     if re.search("^bru.*s$",word):
#         list_test.append(word)
# print(list_test)

def wildcard(word, vocabulary=None):
    """
    Utility function for wildcard searching.

    Args:
    `word`: query term in which every `*` stands for any run of characters
    (possibly empty); all other characters are matched literally

    `vocabulary`: iterable of candidate words; when omitted, the module-level
    `unique_words_all` is used

    Returns:
    The list of vocabulary words that match the whole pattern, in the
    iteration order of the vocabulary.
    """
    if vocabulary is None:
        vocabulary = unique_words_all
    # Escape each literal piece so that characters such as '.' or '+' typed by
    # the user are not interpreted as regex syntax, and join ALL pieces so
    # that terms with several '*' (or none) are handled as well.
    pattern = re.compile(".*".join(re.escape(piece) for piece in word.split("*")))
    return [candidate for candidate in vocabulary if pattern.fullmatch(candidate)]

def wildcard_search(word):
    """
    Resolve a wildcard token (one containing `*`) to document flags.

    Every vocabulary word returned by `wildcard(word)` is looked up in
    `linked_list_data`; its documents are marked in a 0/1 list with one slot
    per entry of `files_with_index`, and that list is printed. The per-word
    lists are then combined with `boolean_or`.

    Returns:
    The OR of all per-word lists; all zeros when nothing matches.
    """
    doc_count = len(files_with_index)
    combined = [0] * doc_count
    for match in wildcard(word):
        if match not in unique_words_all:
            continue
        hits = [0] * doc_count
        # The head node carries no posting; entries start at head.nextval.
        entry = linked_list_data[match].head.nextval
        while entry is not None:
            hits[entry.doc - 1] = 1
            entry = entry.nextval
        print("unique available :> zeroes_and_ones for ",match," :>",hits)
        combined = boolean_or(combined, hits)
    return combined

# from nltk.metrics.distance  import edit_distance
# from nltk.metrics.distance import jaccard_distance
# from nltk.util import ngrams
# import collections

# k = 0 #retrieve top k+1 words for edit distance
# edited_query_words = []

def boolean_and(left_op, right_op):
    """
    Element-wise bitwise AND of two sequences.

    Args:
    `left_op`: left operand of the AND

    `right_op`: right operand of the AND

    Return:

    A new list holding `l & r` for each aligned pair of elements. Pairing
    stops at the end of the shorter operand.
    """
    conjunction = []
    for left_bit, right_bit in zip(left_op, right_op):
        conjunction.append(left_bit & right_bit)
    return conjunction

def boolean_or(left_op, right_op):
    """
    Element-wise bitwise OR of two sequences.

    Args:
    `left_op`: left operand of the OR

    `right_op`: right operand of the OR

    Return:

    A new list holding `l | r` for each aligned pair of elements. Pairing
    stops at the end of the shorter operand.
    """
    disjunction = []
    for left_bit, right_bit in zip(left_op, right_op):
        disjunction.append(left_bit | right_bit)
    return disjunction

def boolean_not(right_op):
    """
    Element-wise logical NOT of a sequence.

    Args:
    `right_op`: the single operand of the NOT

    Return:

    A new list with 1 wherever the operand's element is falsy (such as 0)
    and 0 wherever it is truthy.
    """
    # `not` yields a bool; int() turns it back into the 0/1 form that the
    # other boolean helpers combine with `&` and `|`.
    return [int(not flag) for flag in right_op]

def shunting_yard(infix_tokens):
    """
    Convert an infix boolean query into postfix (reverse Polish) order.

    Operator precedence, lowest to highest: OR, AND, NOT. Brackets `(` and
    `)` group sub-expressions and never appear in the output.

    Args:
    `infix_tokens`: the query as a sequence of string tokens: operands, the
    operators and/or/not (any letter case) and brackets

    Returns:
    The list of tokens in postfix order, all lower-cased.

    Raises:
    `ValueError`: if the brackets are not balanced.
    """
    # define precedences
    precedence = {'or': 1, 'and': 2, 'not': 3}

    # declare data structures
    output = []
    operator_stack = []

    for token in infix_tokens:
        # Normalise first so that "AND" is recognised as an operator instead
        # of being emitted as a lower-cased operand.
        token = token.lower()

        if token == '(':
            operator_stack.append(token)

        # right bracket: move operators to the output until the matching '('
        elif token == ')':
            while operator_stack and operator_stack[-1] != '(':
                output.append(operator_stack.pop())
            if not operator_stack:
                raise ValueError("mismatched parentheses: ')' without a matching '('")
            operator_stack.pop()  # discard the '('

        # operator: first move operators of strictly higher precedence to the
        # output; an opening bracket on the stack acts as a barrier
        elif token in precedence:
            while (operator_stack and operator_stack[-1] != '('
                   and precedence[operator_stack[-1]] > precedence[token]):
                output.append(operator_stack.pop())
            operator_stack.append(token)

        # operand
        else:
            output.append(token)

    # flush the operators that are still on the stack
    while operator_stack:
        operator = operator_stack.pop()
        if operator == '(':
            raise ValueError("mismatched parentheses: '(' is never closed")
        output.append(operator)
    return output


def _incidence_vector(term, total_files):
    """
    Return the 0/1 document flags of `term`: slot `doc - 1` is 1 for every
    document in the term's posting list; all zeros for an unknown term.
    """
    vector = [0] * total_files
    if term in unique_words_all:
        # The head node carries no posting; entries start at head.nextval.
        node = linked_list_data[term].head.nextval
        while node is not None:
            vector[node.doc - 1] = 1
            node = node.nextval
    return vector


def process_query(query):
    """
    Utility function for processing the user query.

    The query is lower-cased and tokenised. Operands that are neither in
    `unique_words_all` nor wildcard terms are replaced by the vocabulary word
    with the smallest `edit_distance`. The tokens are then converted to
    postfix with `shunting_yard` and evaluated with the boolean helpers.

    Args:
    `query`: query string made of words, wildcard words containing `*`, the
    operators and/or/not, and brackets

    Returns:
    A list with one 0/1 flag per entry of `files_with_index`; all zeros for
    an empty query.

    Raises:
    `ValueError`: if an operator is missing an operand.
    """
    operators = ('and', 'or', 'not')
    brackets = ('(', ')')

    # Pad BOTH sides of every bracket and split on any run of whitespace, so
    # that input such as "(a)and b" or doubled spaces never produces fused or
    # empty tokens (an empty token would be "spell-corrected" into a word).
    query = query.lower().replace('(', ' ( ').replace(')', ' ) ').split()

    print("query in process_query", query)

    total_files = len(files_with_index)

    # Spell-correct operands that are not in the vocabulary. Wildcard terms
    # are left alone; they are expanded during evaluation.
    for i, term in enumerate(query):
        if term in operators or term in brackets:
            continue
        if "*" in term or term in unique_words_all:
            continue
        scored = [(edit_distance(term, w, substitution_cost=2, transpositions=True), w)
                  for w in unique_words_all]
        if scored:  # an empty vocabulary offers nothing to correct to
            query[i] = min(scored)[1]

    print("edited_query_words after edit distance: ",query)
    print("query :",query)

    result_stack = []
    for token in shunting_yard(query):
        if token in operators:
            needed = 1 if token == 'not' else 2
            if len(result_stack) < needed:
                raise ValueError("malformed query: '%s' is missing an operand" % token)
            right_op = result_stack.pop()
            if token == 'not':
                result = boolean_not(right_op)
            else:
                left_op = result_stack.pop()
                if token == 'and':
                    result = boolean_and(left_op, right_op)
                else:
                    result = boolean_or(left_op, right_op)
        elif "*" in token:
            result = wildcard_search(token)
            print("result of OR of wildcard: ",result)
        else:
            print(token)
            # An unknown term gives an all-zero list of full length; an empty
            # list would silently truncate later AND/OR results.
            result = _incidence_vector(token, total_files)
        result_stack.append(result)

    if not result_stack:
        return [0] * total_files
    # When operands were given without an operator between them, several
    # results remain; the most recent one is returned.
    return result_stack.pop()


def do_quering(query):
    """
    Run a user query and print the names of the matching files.

    The query is evaluated with `process_query`, whose result is printed.
    Each position holding a 1 is then translated to a file name through
    `files_with_index`, counting positions from 1.

    Args:
    `query`: user query

    Returns:
    Nothing. The matching file names are printed, or a message when no
    document matches.
    """
    res = process_query(query)
    print("Final result : ",res)
    files = [files_with_index[position]
             for position, flag in enumerate(res, start=1)
             if flag == 1]
    if files:
        print(files)
    else:
        print("No such word present in the documents")

# do_quering("(julius and caesar)")

# do_quering("call or not (julius and caesar)")

# do_quering("call or not (julius and caesar) and a*")