# -*- coding: utf-8 -*-
"""
@author: csp5pa1
"""

import nltk


def indexing(doc_list):
    """Build an inverted index over the NLTK Gutenberg corpus.

    Parameters
    ----------
    doc_list : list of str
        Gutenberg corpus file ids to index (e.g. 'austen-emma.txt').

    Returns
    -------
    dict
        Maps each stemmed term to its postings: a dict of
        {filename: frequency of the term in that file}.
        Stopwords, numbers, and single-character terms are skipped.
    """
    stemmer = nltk.stem.porter.PorterStemmer()
    # Stop words of the English language; a set gives O(1) membership tests
    # in the per-token loop below.
    stopwords = set(nltk.corpus.stopwords.words("english"))

    inverted_index = {}  # key: term, value: {filename: frequency}

    for filename in doc_list:
        # Raw text of this file from the Gutenberg corpus.
        raw_text = nltk.corpus.gutenberg.raw(filename)
        # Tokenize raw text into a list of word tokens.
        tokens = nltk.word_tokenize(raw_text)

        for word in tokens:
            term = word.lower()          # case-folding
            if term in stopwords:
                continue                 # ignore stopwords
            if is_number(term):
                continue                 # ignore numbers
            term = stemmer.stem(term)    # Porter stemming
            if not term:
                continue                 # guard: stemming left nothing
            if term[-1] == "'":
                term = term[:-1]         # strip trailing apostrophe
            if len(term) == 1:
                continue                 # ignore single-character terms

            # First sighting of the term: create its postings dict.
            if term not in inverted_index:
                inverted_index[term] = dict()

            # First sighting of the term in this file vs. repeat sighting.
            if filename not in inverted_index[term]:
                inverted_index[term][filename] = 1
            else:
                inverted_index[term][filename] += 1

    return inverted_index


def is_number(token):
    """Return True if *token* parses as a number, else False.

    Commas are ignored (e.g. '1,000' counts as a number). Adapted from:
    http://stackoverflow.com/questions/354038/how-do-i-check-if-a-string-is-a-number-in-python

    Parameters
    ----------
    token : str
        Token string to test.
    """
    token = token.replace(",", "")  # ignore commas in token
    # Try parsing the token as a float.
    try:
        float(token)
        return True
    except ValueError:
        return False


doc_list = nltk.corpus.gutenberg.fileids()
inverted_index = indexing(doc_list)  # call the indexer
# print posting list for the word carpet
print("Posting list for the word carpet: ", inverted_index["carpet"])
# expected:
# {'austen-persuasion.txt': 1, 'bryant-stories.txt': 1, 'chesterton-ball.txt': 1,
#  'chesterton-brown.txt': 5, 'edgeworth-parents.txt': 4,
#  'melville-moby_dick.txt': 5, 'whitman-leaves.txt': 2}

# print posting list for the word troop
print("Posting list for the word troop: ", inverted_index["troop"])
# expected:
# {'austen-sense.txt': 1, 'bible-kjv.txt': 20, 'bryant-stories.txt': 2,
#  'melville-moby_dick.txt': 3, 'milton-paradise.txt': 4,
#  'shakespeare-caesar.txt': 2, 'shakespeare-macbeth.txt': 1,
#  'whitman-leaves.txt': 11}

# print posting list for the word overburden
print("Posting list for the word overburden: ", inverted_index["overburden"])
# expected: {'melville-moby_dick.txt': 1}


def search1(inverted_index):
    """Boolean query: carpet AND troop.

    Returns the set of filenames appearing in the posting lists of both
    'carpet' and 'troop'.
    """
    # filenames in posting list for the word carpet
    posting1 = inverted_index["carpet"].keys()
    # filenames in posting list for the word troop
    posting2 = inverted_index["troop"].keys()
    # dict key views support set algebra; & computes set intersection
    return posting1 & posting2


print(search1(inverted_index))


def search2(inverted_index):
    """Boolean query: carpet AND troop AND NOT overburden.

    Returns the filenames in both 'carpet' and 'troop' posting lists,
    excluding any that also contain 'overburden'.
    """
    # posting list for the word carpet
    posting1 = inverted_index["carpet"].keys()
    # posting list for the word troop
    posting2 = inverted_index["troop"].keys()
    # posting list for the word overburden
    posting3 = inverted_index["overburden"].keys()
    # & computes set intersection, - computes set difference
    return (posting1 & posting2) - posting3


print(search2(inverted_index))