import string
from math import log10

"""
    tf-idf Wikipedia: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
    tf-idf and other word frequency algorithms are often used
    as a weighting factor in information retrieval and text
    mining. 83% of text-based recommender systems use
    tf-idf for term weighting. In layman's terms, tf-idf
    is a statistic intended to reflect how important a word
    is to a document in a corpus (a collection of documents).

    Here I've implemented several word frequency algorithms
    that are commonly used in information retrieval: Term Frequency,
    Document Frequency, Inverse Document Frequency, and
    TF-IDF (Term Frequency * Inverse Document Frequency).

    Term Frequency is a statistical function that
    returns a number representing how frequently
    an expression occurs in a document. This
    indicates how significant a particular term is in
    a given document.

    Document Frequency is a statistical function that returns
    an integer representing the number of documents in a
    corpus that a term occurs in (where the maximum number returned
    would be the number of documents in the corpus).

    Inverse Document Frequency is mathematically written as
    log10(N/df), where N is the number of documents in your
    corpus and df is the Document Frequency. If df is 0, a
    ZeroDivisionError will be raised.

    Term Frequency * Inverse Document Frequency is a measure
    of the originality of a term. It is mathematically written
    as tf * log10(N/df). It compares the number of times
    a term appears in a document with the number of documents
    the term appears in. If df is 0, a ZeroDivisionError will be raised.
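
    A worked example, using the same numbers as the doctests below: if a term
    appears tf = 2 times in a document, and df = 1 document out of N = 3
    documents in the corpus contains it, then idf = log10(3/1) = 0.477
    (rounded) and tf-idf = 2 * 0.477 = 0.954.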
| 40 | +""" |
| 41 | + |
| 42 | + |
| 43 | +def term_frequency(term : str, document : str) -> int: |
| 44 | + """ |
| 45 | + Return the number of times a term occurs within |
| 46 | + a given document. |
| 47 | + @params: term, the term to search a document for, and document, |
| 48 | + the document to search within |
| 49 | + @returns: an integer representing the number of times a term is |
| 50 | + found within the document |
| 51 | +
|
| 52 | + @examples: |
| 53 | + >>> term_frequency("to", "To be, or not to be") |
| 54 | + 2 |
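    >>> term_frequency("TO", "To be, or not to be")  # matching is case-insensitive
    2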
    """
    # strip all punctuation and newlines and replace them with ''
    document_without_punctuation = document.translate(
        str.maketrans("", "", string.punctuation)
    ).replace("\n", "")
    tokenize_document = document_without_punctuation.split(" ")  # word tokenization
    return len([word for word in tokenize_document if word.lower() == term.lower()])


def document_frequency(term: str, corpus: str) -> tuple[int, int]:
    """
    Calculate the number of documents in a corpus that contain a
    given term.
    @params: term, the term to search each document for, and corpus, a collection of
             documents. Each document should be separated by a newline.
    @returns: the number of documents in the corpus that contain the term you are
              searching for and the total number of documents in the corpus
    @examples:
    >>> document_frequency("first", "This is the first document in the corpus.\\nThIs \
is the second document in the corpus.\\nTHIS is \
the third document in the corpus.")
    (1, 3)
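    >>> document_frequency("is", "This is a document.\\nAnother document.")  # whole words only
    (1, 2)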
    """
    corpus_without_punctuation = corpus.translate(
        str.maketrans("", "", string.punctuation)
    )  # strip all punctuation and replace it with ''
    documents = corpus_without_punctuation.split("\n")
    lowercase_documents = [document.lower() for document in documents]
    # match whole words so that, for example, "is" does not match "This"
    return (
        len([doc for doc in lowercase_documents if term.lower() in doc.split()]),
        len(documents),
    )


def inverse_document_frequency(df: int, N: int) -> float:
    """
    Return a float denoting the importance
    of a word. This measure of importance is
    calculated by log10(N/df), where N is the
    number of documents and df is
    the Document Frequency.
    @params: df, the Document Frequency, and N,
             the number of documents in the corpus.
    @returns: log10(N/df), rounded to 3 decimal places
    @examples:
    >>> inverse_document_frequency(3, 0)
    Traceback (most recent call last):
        ...
    ValueError: log10(0) is undefined.
    >>> inverse_document_frequency(1, 3)
    0.477
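    >>> inverse_document_frequency(3, 3)  # a term in every document carries no weight
    0.0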
    >>> inverse_document_frequency(0, 3)
    Traceback (most recent call last):
        ...
    ZeroDivisionError: df must be > 0
    """
    if df == 0:
        raise ZeroDivisionError("df must be > 0")
    elif N == 0:
        raise ValueError("log10(0) is undefined.")
    return round(log10(N / df), 3)


def tf_idf(tf: int, idf: float) -> float:
    """
    Combine the term frequency
    and inverse document frequency functions to
    calculate the originality of a term. This
    'originality' is calculated by multiplying
    the term frequency and the inverse document
    frequency: tf-idf = TF * IDF
    @params: tf, the term frequency, and idf, the inverse document
             frequency
    @returns: tf * idf, rounded to 3 decimal places
    @examples:
    >>> tf_idf(2, 0.477)
    0.954
    """
    return round(tf * idf, 3)
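

if __name__ == "__main__":
    # A minimal end-to-end sketch tying the functions above together; the
    # three-document corpus below is invented purely for illustration.
    corpus = (
        "This is the first document in the corpus.\n"
        "ThIs is the second document in the corpus.\n"
        "THIS is the third document in the corpus."
    )
    tf = term_frequency("first", "This is the first document in the corpus.")
    df, corpus_size = document_frequency("first", corpus)
    idf = inverse_document_frequency(df, corpus_size)
    print(tf_idf(tf, idf))  # 1 * log10(3 / 1) -> 0.477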