# -*- coding: utf-8 -*-
"""
@author: csp5pa1
"""

import nltk


def indexing(doc_list):
    """Build an inverted index over the NLTK Gutenberg corpus.

    Parameters
    ----------
    doc_list : list of str
        Gutenberg corpus file ids to index (e.g. 'austen-emma.txt').

    Returns
    -------
    dict
        Maps each stemmed term to its postings: a dict of
        {filename: frequency of the term in that file}.
        Stopwords, numbers, and single-character terms are skipped.
    """
    stemmer = nltk.stem.porter.PorterStemmer()
    # Stop words of the English language; a set gives O(1) membership tests
    # in the per-token loop below.
    stopwords = set(nltk.corpus.stopwords.words("english"))

    inverted_index = {}  # key: term, value: {filename: frequency}

    for filename in doc_list:
        # Raw text of this file from the Gutenberg corpus.
        raw_text = nltk.corpus.gutenberg.raw(filename)
        # Tokenize raw text into a list of word tokens.
        tokens = nltk.word_tokenize(raw_text)

        for word in tokens:
            term = word.lower()          # case-folding
            if term in stopwords:
                continue                 # ignore stopwords
            if is_number(term):
                continue                 # ignore numbers
            term = stemmer.stem(term)    # Porter stemming
            if not term:
                continue                 # guard: stemming left nothing
            if term[-1] == "'":
                term = term[:-1]         # strip trailing apostrophe
            if len(term) == 1:
                continue                 # ignore single-character terms

            # First sighting of the term: create its postings dict.
            if term not in inverted_index:
                inverted_index[term] = dict()

            # First sighting of the term in this file vs. repeat sighting.
            if filename not in inverted_index[term]:
                inverted_index[term][filename] = 1
            else:
                inverted_index[term][filename] += 1

    return inverted_index


def is_number(token):
    """Return True if *token* parses as a number, else False.

    Commas are ignored (e.g. '1,000' counts as a number). Adapted from:
    http://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-is-a-number-in-python

    Parameters
    ----------
    token : str
        Token string to test.
    """
    token = token.replace(",", "")  # ignore commas in token
    # Try parsing the token as a float.
    try:
        float(token)
        return True
    except ValueError:
        return False


doc_list = nltk.corpus.gutenberg.fileids()
inverted_index = indexing(doc_list)  # call the indexer
# print posting list for the word carpet
print("Posting list for the word carpet: ", inverted_index["carpet"])
# expected:
# {'austen-persuasion.txt': 1, 'bryant-stories.txt': 1, 'chesterton-ball.txt': 1,
#  'chesterton-brown.txt': 5, 'edgeworth-parents.txt': 4,
#  'melville-moby_dick.txt': 5, 'whitman-leaves.txt': 2}

# print posting list for the word troop
print("Posting list for the word troop: ", inverted_index["troop"])
# expected:
# {'austen-sense.txt': 1, 'bible-kjv.txt': 20, 'bryant-stories.txt': 2,
#  'melville-moby_dick.txt': 3, 'milton-paradise.txt': 4,
#  'shakespeare-caesar.txt': 2, 'shakespeare-macbeth.txt': 1,
#  'whitman-leaves.txt': 11}

# print posting list for the word overburden
print("Posting list for the word overburden: ", inverted_index["overburden"])
# expected: {'melville-moby_dick.txt': 1}


def search1(inverted_index):
    """Boolean query: carpet AND troop.

    Returns the set of filenames appearing in the posting lists of both
    'carpet' and 'troop'.
    """
    # filenames in posting list for the word carpet
    posting1 = inverted_index["carpet"].keys()
    # filenames in posting list for the word troop
    posting2 = inverted_index["troop"].keys()
    # dict key views support set algebra; & computes set intersection
    return posting1 & posting2


print(search1(inverted_index))


def search2(inverted_index):
    """Boolean query: carpet AND troop AND NOT overburden.

    Returns the filenames in both 'carpet' and 'troop' posting lists,
    excluding any that also contain 'overburden'.
    """
    # posting list for the word carpet
    posting1 = inverted_index["carpet"].keys()
    # posting list for the word troop
    posting2 = inverted_index["troop"].keys()
    # posting list for the word overburden
    posting3 = inverted_index["overburden"].keys()
    # & computes set intersection, - computes set difference
    return (posting1 & posting2) - posting3


print(search2(inverted_index))