A Private, Customizable Search Engine

In this notebook I build a private, customizable search engine. It takes a user-defined corpus of documents, lemmatizes the corpus and the user's query, maps each document into an n-dimensional vector space using tf-idf weighting, and vectorizes the query the same way. I then compute the cosine similarity between each document vector and the query vector and return the documents ordered by relevance.

For example, a search for the phrase 'space telescopes' returns the three relevant documents in the test corpus in order of their relevance to the query, namely: james_webb.txt, nasa_budget.txt, and star_trek.txt.

Term Frequency: $$ tf(t,d) = \frac{count(t \in d)}{|d|} $$
Count of the word in the document divided by the number of words in the document.
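As a quick illustration of the term-frequency formula, here is a standalone toy example (separate from the tfidf function defined later in the notebook):

# tf: count of the word in the document divided by the number of tokens in the document
toy_doc = "the telescope orbits far from the earth".split()
tf = toy_doc.count("telescope") / len(toy_doc)
print(round(tf, 4))  # 1/7 ≈ 0.1429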

Inverse Document Frequency: $$ idf(t,D) = \log\left(\frac{N}{count(d \in D : t \in d)}\right) $$
The log of the number of documents in the corpus divided by the number of documents containing the term.
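A matching toy sketch of the idf term over a three-document corpus (again standalone, using the same base-10 log as the tfidf function below):

import numpy as np
toy_corpus = [
    "the telescope orbits far from the earth".split(),
    "the budget was cut last year".split(),
    "a new space telescope launched".split(),
]
df = sum(1 for d in toy_corpus if "telescope" in d)  # number of documents containing the term
idf = np.log10(len(toy_corpus) / df)                 # log(N / document frequency)
print(round(idf, 4))  # log10(3/2) ≈ 0.1761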

Cosine Similarity: $$ \frac{A \cdot B}{|A||B|} = \frac{\sum_{i=1}^{n} A_{i}B_{i}}{\sqrt{\sum_{i=1}^{n} A_{i}^2}\sqrt{\sum_{i=1}^{n} B_{i}^2}} $$
The dot product of two vectors divided by the product of their magnitudes.
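A small numeric check of the cosine formula with NumPy (a toy example, not part of the search pipeline itself):

import numpy as np
A = np.array([0.1, 0.0, 0.3])
B = np.array([0.2, 0.1, 0.0])
# dot product of the two vectors divided by the product of their magnitudes
cos_sim = A.dot(B) / (np.linalg.norm(A) * np.linalg.norm(B))
print(round(cos_sim, 4))  # ≈ 0.2828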

In [1]:
import numpy as np
import os
In [2]:
source_files = os.listdir(r'C:\Users\hunte\Documents\hc_tfidf_search')
In [3]:
os.getcwd()
os.chdir(r'C:\Users\hunte\Documents\hc_tfidf_search')
os.getcwd()
Out[3]:
'C:\\Users\\hunte\\Documents\\hc_tfidf_search'
In [4]:
file_list = []
documents = []
for file in source_files:
    file_list.append(file)
    # read each file, lower-case its text, and store it at the same index as its filename
    with open(file, "r", errors="ignore") as f:
        documents.append(f.read().lower())
In [5]:
for i in range(0,len(documents)):
    documents[i] = documents[i].split()

Lemmatize the corpus

In [6]:
import nltk
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
 
lem = WordNetLemmatizer()
In [7]:
docs = []
for i in range(0,len(documents)):
    sentence_results = []
    for w in documents[i]:
        sentence_results.append(lem.lemmatize(w))
    docs.append(sentence_results)

Get the tf-idf weight for each word in the corpus

In [8]:
def tfidf(word, sentence):
    # term frequency: count of the word divided by the length of the document
    tf = sentence.count(word) / len(sentence)
    try:
        # inverse document frequency: log of the corpus size over the number of documents containing the word
        idf = np.log10(len(docs) / sum(1 for doc in docs if word in doc))
    except ZeroDivisionError:
        # the word appears in no document (e.g. a query term outside the corpus vocabulary)
        idf = 0.0
    return round(tf * idf, 4)
In [9]:
print(tfidf('space',docs[4]))
print(tfidf('sport',docs[1]))
0.0048
0.0266

Get a list of unique words prior to further text preprocessing

In [10]:
result = {x for l in docs for x in l}
vocab = sorted(result)
print(len(vocab))
1798

Remove stop words and punctuation, then remove any new duplicates

In [11]:
from nltk.corpus import stopwords
In [12]:
stop_words = set(stopwords.words("english"))
In [13]:
new_vocab = [w for w in vocab if not w in stop_words]
print(len(new_vocab))
1708
In [14]:
import string
new_vocab = [''.join(c for c in s if c not in string.punctuation) for s in new_vocab]
In [15]:
vocab = [*set(new_vocab)]
vocab = sorted(vocab)
print(len(vocab))
1523

Create vectors of tf-idf weights

In [16]:
doc_vectors = {}
main_vector = []
for i in range(len(docs)):
    # one tf-idf weight per vocabulary word, so every document vector has the same dimensions
    vec = [tfidf(word, docs[i]) for word in vocab]
    doc_vectors["vec_doc%d" % i] = vec
    main_vector.append(vec)
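For comparison, scikit-learn's TfidfVectorizer builds the same kind of document-term matrix in a few lines. This is only a sketch (the variable names joined_docs, doc_matrix, and query_vec are illustrative): its default idf uses a smoothed natural log and L2-normalizes each row, so the raw weights will differ slightly from the hand-rolled values above.

from sklearn.feature_extraction.text import TfidfVectorizer

# the vectorizer expects raw strings, so re-join the lemmatized tokens
joined_docs = [' '.join(d) for d in docs]
vectorizer = TfidfVectorizer(stop_words='english')
doc_matrix = vectorizer.fit_transform(joined_docs)     # shape: (n_documents, n_vocabulary_terms)
query_vec = vectorizer.transform(['space telescope'])  # query projected into the same space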

Get user query and vectorize

In [17]:
query = input("Enter query: ")
Enter query: space telescopes
In [18]:
print(query)
space telescopes

Lemmatize the query

In [19]:
q_list = []
for w in query.split():
    q_list.append(lem.lemmatize(w))

query = ' '.join([str(elem) for elem in q_list])
print(query)
space telescope
In [20]:
q_split = query.split()
q_vector = []
for w in vocab:
    q_vector.append(tfidf(w,q_split))

Get cosine similarity

In [21]:
from numpy.linalg import norm
from numpy import dot
In [22]:
i = 0
cosine_list = []
for v in main_vector:
    cos_sim = dot(v, q_vector)/(norm(v)*norm(q_vector))
    i_cos = i, cos_sim
    cosine_list.append(i_cos)
    i += 1
In [23]:
cosine_list_sorted = sorted(cosine_list, key=lambda tup: tup[1], reverse = True)
print(cosine_list_sorted)
[(4, 0.3110237899386628), (6, 0.04537022899608141), (7, 0.02006837306150367), (0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (5, 0.0)]

Return ordered, relevant document(s) to the user

In [24]:
print("Here are the results for your query: ")
for j in range(len(cosine_list_sorted)):
    if cosine_list_sorted[j][1] > 0:
        file_number = cosine_list_sorted[j][0]
        print("\t" +file_list[file_number])
        
Here are the results for your query: 
	james_webb.txt
	nasa_budget.txt
	star_trek.txt