From 24632d4320c7c83849446272a518511c4201bee2 Mon Sep 17 00:00:00 2001 From: = Date: Sun, 21 Jun 2020 12:11:04 -0400 Subject: [PATCH 01/19] NLP Word Frequency Algorithms --- machine_learning/word_frequency_functions.py | 136 +++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100644 machine_learning/word_frequency_functions.py diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py new file mode 100644 index 000000000000..a709f029ce13 --- /dev/null +++ b/machine_learning/word_frequency_functions.py @@ -0,0 +1,136 @@ +import string +from math import log10 + +""" Here I've implemented several word frequency functions + that are commonly used in information retrieval: Term Frequency, + Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency) + are included. + + Term Frequency is a statistical function that + returns a number representing how frequently + an expression occurs in a document.This + indicates how significant a particular term is in + a given document. + + Document Frequency is a statistical function that returns + an integer representing + the number of documents in a corpus that a term occurs in + (where the max integer returned would be the number of + documents in the corpus). + + Inverse Document Frequency is mathematically written as + log10(N/df), where N is the number of documents in your + corpus and df is the Document Frequency. If df is 0, a + ZeroDivisionError will be thrown. + + Term-Frequency*Inverse-Document-Frequency is a measure + of the originality of a term. It is mathematically written + as tf*log10(N/df). It compares the number of times + a term appears in a document with the number of documents + the term appears in. If df is 0, a ZeroDivisionError will be thrown. +""" + + +def term_frequency(term, document): + """ + A function that returns the number of times a term occurs within + a given document. + @params: term, the term to search a document for, and document, + the document to search within + @returns: an integer representing the number of times a term is + found within the document + + @examples: + >>> document = "To be, or not to be" + >>> term = "to" + 2 + + >>> document = "Natural Language Processing is a subfield of Artificial Intelligence + concerned with interactions between computers and human languages" + >>> term = "NLP" + 0 + """ + # strip all punctuation and newlines and replace it with '' + document_without_punctuation = document.translate( + str.maketrans("", "", string.punctuation) + ).replace("\n", "") + tokenize_document = document_without_punctuation.split(" ") # word tokenization + term_frequency = len( + [word for word in tokenize_document if word.lower() == term.lower()] + ) + return term_frequency + + +def document_frequency(term, corpus): + """ + A function that calculates the number of documents in a corpus that contain a + given term + @params : term, the term to search each document for, and corpus, a collection of + documents. Each document should be separated by a newline. + @returns : the number of documents in the corpus that contain the term you are + searching for and the number of documents in the corpus + @examples : + >>> corpus = + "This is the first document in the corpus.\n + ThIs is the second document in the corpus.\n + THIS is the third document in the corpus." + >>> term = "first" + 1 + >>> term = "document" + 3 + >>> term = "this" + 3 + """ + corpus_without_punctuation = corpus.translate( + str.maketrans("", "", string.punctuation) + ) # strip all punctuation and replace it with '' + documents = corpus_without_punctuation.split("\n") + lowercase_documents = [document.lower() for document in documents] + document_frequency = len( + [document for document in lowercase_documents if term.lower() in document] + ) # number of documents that contain the term + return document_frequency, len(documents) + + +def inverse_document_frequency(df, N): + """ + A function that returns an integer denoting the importance + of a word. This measure of importance is + calculated by log10(N/df), where N is the + number of documents and df is + the Document Frequency. + @params : df, the Document Frequency, and corpus, + a collection of documents separated + by a newline. + @returns : log10(N/df) + @examples : + >>> df = 1 + >>> corpus = + "This is the first document in the corpus.\n + ThIs is the second document in the corpus.\n + THIS is the third document in the corpus." + log10(3/1) = .477 + >>> df = 3 + log10(3/3) = log10(1) = 0 + >>> df = 0 + log10(3/0) -> throws ZeroDivisionError + """ + try: + idf = round(log10(N / df), 3) + return idf + except ZeroDivisionError: + print("The term you searched for is not in the corpus.") + + +def tf_idf(tf, idf): + """ + A function that combines the term frequency + and inverse document frequency functions to + calculate the originality of a term. This + 'originality' is calculated by multiplying + the term frequency and the inverse document + frequency : tf-idf = TF * IDF + @params : tf, the term frequency, and idf, the inverse document + frequency + """ + return round(tf * idf, 3) From cbb5f41d122a39ae170864b5a1bb82400f9ea127 Mon Sep 17 00:00:00 2001 From: = Date: Mon, 22 Jun 2020 17:23:51 -0400 Subject: [PATCH 02/19] Added type hints and Wikipedia link to tf-idf --- machine_learning/word_frequency_functions.py | 39 +++++++++++--------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index a709f029ce13..a9e8211543cd 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -1,22 +1,31 @@ import string from math import log10 -""" Here I've implemented several word frequency functions +""" + tf-idf Wikipedia: https://en.wikipedia.org/wiki/Tf%E2%80%93idf + tf-idf and other word frequency algorithms are often used + as a weighting factor in information retrieval and text + mining. 83% of text-based recommender systems use + tf-idf for term weighting. In Layman's terms, tf-idf + is a statistic intended to reflect how important a word + is to a document in a corpus (a collection of documents) + + + Here I've implemented several word frequency algorithms that are commonly used in information retrieval: Term Frequency, Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency) are included. Term Frequency is a statistical function that returns a number representing how frequently - an expression occurs in a document.This + an expression occurs in a document. This indicates how significant a particular term is in a given document. Document Frequency is a statistical function that returns - an integer representing - the number of documents in a corpus that a term occurs in - (where the max integer returned would be the number of - documents in the corpus). + an integer representing the number of documents in a + corpus that a term occurs in (where the max number returned + would be the number of documents in the corpus). Inverse Document Frequency is mathematically written as log10(N/df), where N is the number of documents in your @@ -31,7 +40,7 @@ """ -def term_frequency(term, document): +def term_frequency(term : str, document : str) -> int: """ A function that returns the number of times a term occurs within a given document. @@ -61,7 +70,7 @@ def term_frequency(term, document): return term_frequency -def document_frequency(term, corpus): +def document_frequency(term: str, corpus: str) -> int: """ A function that calculates the number of documents in a corpus that contain a given term @@ -92,23 +101,19 @@ def document_frequency(term, corpus): return document_frequency, len(documents) -def inverse_document_frequency(df, N): +def inverse_document_frequency(df : int, N: int) -> int: """ A function that returns an integer denoting the importance of a word. This measure of importance is calculated by log10(N/df), where N is the number of documents and df is the Document Frequency. - @params : df, the Document Frequency, and corpus, - a collection of documents separated - by a newline. + @params : df, the Document Frequency, and N, + the number of documents in the corpus. @returns : log10(N/df) @examples : >>> df = 1 - >>> corpus = - "This is the first document in the corpus.\n - ThIs is the second document in the corpus.\n - THIS is the third document in the corpus." + >>> N = 3 log10(3/1) = .477 >>> df = 3 log10(3/3) = log10(1) = 0 @@ -122,7 +127,7 @@ def inverse_document_frequency(df, N): print("The term you searched for is not in the corpus.") -def tf_idf(tf, idf): +def tf_idf(tf : int, idf: int) -> int: """ A function that combines the term frequency and inverse document frequency functions to From eb260b0b3664a17a8e3e8c6ddfb827849f8ab8d4 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 17:24:53 -0400 Subject: [PATCH 03/19] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index a9e8211543cd..15c6d8153b4c 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -51,7 +51,7 @@ def term_frequency(term : str, document : str) -> int: @examples: >>> document = "To be, or not to be" - >>> term = "to" + >>> term_frequency("to", "To be, or not to be") 2 >>> document = "Natural Language Processing is a subfield of Artificial Intelligence From e961f523b864dd8a87d7b7394e39e76ad56fb26a Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 17:25:02 -0400 Subject: [PATCH 04/19] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 15c6d8153b4c..8123b6479b74 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -54,7 +54,7 @@ def term_frequency(term : str, document : str) -> int: >>> term_frequency("to", "To be, or not to be") 2 - >>> document = "Natural Language Processing is a subfield of Artificial Intelligence + >>> document = ("Natural Language Processing is a subfield of Artificial Intelligence " concerned with interactions between computers and human languages" >>> term = "NLP" 0 From e6b2357cedbcdd362d568bfacbde8cbfa3798bbd Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 17:25:13 -0400 Subject: [PATCH 05/19] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 8123b6479b74..0b96b2e0de92 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -55,7 +55,7 @@ def term_frequency(term : str, document : str) -> int: 2 >>> document = ("Natural Language Processing is a subfield of Artificial Intelligence " - concerned with interactions between computers and human languages" + ... "concerned with interactions between computers and human languages") >>> term = "NLP" 0 """ From aa61ec8247eff5145823e5d2de9e3915d6cdbd45 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 17:25:22 -0400 Subject: [PATCH 06/19] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 0b96b2e0de92..be78b78b01b3 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -56,7 +56,7 @@ def term_frequency(term : str, document : str) -> int: >>> document = ("Natural Language Processing is a subfield of Artificial Intelligence " ... "concerned with interactions between computers and human languages") - >>> term = "NLP" + >>> term_frequency("NLP", document) 0 """ # strip all punctuation and newlines and replace it with '' From bed579d82575627e5ffb04da76aafc2414bfae11 Mon Sep 17 00:00:00 2001 From: = Date: Mon, 22 Jun 2020 17:44:12 -0400 Subject: [PATCH 07/19] Fix line length for flake8 --- machine_learning/word_frequency_functions.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index a9e8211543cd..684bcad7c69c 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -54,8 +54,9 @@ def term_frequency(term : str, document : str) -> int: >>> term = "to" 2 - >>> document = "Natural Language Processing is a subfield of Artificial Intelligence - concerned with interactions between computers and human languages" + >>> document = "Natural Language Processing is a subfield of + Artificial Intelligence concerned with interactions + between computers and human languages" >>> term = "NLP" 0 """ From 9ef8e626947c4471efd62f73c596603800112a87 Mon Sep 17 00:00:00 2001 From: = Date: Mon, 22 Jun 2020 17:52:15 -0400 Subject: [PATCH 08/19] Fix line length for flake8 V2 --- machine_learning/word_frequency_functions.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 073d522431d5..eb0f311b7ed3 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -54,16 +54,10 @@ def term_frequency(term : str, document : str) -> int: >>> term_frequency("to", "To be, or not to be") 2 -<<<<<<< HEAD >>> document = "Natural Language Processing is a subfield of Artificial Intelligence concerned with interactions between computers and human languages" >>> term = "NLP" -======= - >>> document = ("Natural Language Processing is a subfield of Artificial Intelligence " - ... "concerned with interactions between computers and human languages") - >>> term_frequency("NLP", document) ->>>>>>> aa61ec8247eff5145823e5d2de9e3915d6cdbd45 0 """ # strip all punctuation and newlines and replace it with '' From 1152eddfc13313aa7cfd2d7d6c4b037e567f46be Mon Sep 17 00:00:00 2001 From: = Date: Mon, 22 Jun 2020 18:44:05 -0400 Subject: [PATCH 09/19] Add line escapes and change int to float --- machine_learning/word_frequency_functions.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index eb0f311b7ed3..f877dd839425 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -54,8 +54,8 @@ def term_frequency(term : str, document : str) -> int: >>> term_frequency("to", "To be, or not to be") 2 - >>> document = "Natural Language Processing is a subfield of - Artificial Intelligence concerned with interactions + >>> document = "Natural Language Processing is a subfield of \ + Artificial Intelligence concerned with interactions \ between computers and human languages" >>> term = "NLP" 0 @@ -80,7 +80,7 @@ def document_frequency(term: str, corpus: str) -> int: @returns : the number of documents in the corpus that contain the term you are searching for and the number of documents in the corpus @examples : - >>> corpus = + >>> corpus = \ "This is the first document in the corpus.\n ThIs is the second document in the corpus.\n THIS is the third document in the corpus." @@ -102,7 +102,7 @@ def document_frequency(term: str, corpus: str) -> int: return document_frequency, len(documents) -def inverse_document_frequency(df : int, N: int) -> int: +def inverse_document_frequency(df : int, N: int) -> float: """ A function that returns an integer denoting the importance of a word. This measure of importance is @@ -128,7 +128,7 @@ def inverse_document_frequency(df : int, N: int) -> int: print("The term you searched for is not in the corpus.") -def tf_idf(tf : int, idf: int) -> int: +def tf_idf(tf : int, idf: int) -> float: """ A function that combines the term frequency and inverse document frequency functions to From e8890d62f388129ac85bddb63efaf22be44adc56 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 20:27:11 -0400 Subject: [PATCH 10/19] Corrected doctests --- machine_learning/word_frequency_functions.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index f877dd839425..033a1ad47365 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -53,12 +53,6 @@ def term_frequency(term : str, document : str) -> int: >>> document = "To be, or not to be" >>> term_frequency("to", "To be, or not to be") 2 - - >>> document = "Natural Language Processing is a subfield of \ - Artificial Intelligence concerned with interactions \ - between computers and human languages" - >>> term = "NLP" - 0 """ # strip all punctuation and newlines and replace it with '' document_without_punctuation = document.translate( @@ -81,8 +75,8 @@ def document_frequency(term: str, corpus: str) -> int: searching for and the number of documents in the corpus @examples : >>> corpus = \ - "This is the first document in the corpus.\n - ThIs is the second document in the corpus.\n + "This is the first document in the corpus.\n \ + ThIs is the second document in the corpus.\n \ THIS is the third document in the corpus." >>> term = "first" 1 @@ -115,9 +109,9 @@ def inverse_document_frequency(df : int, N: int) -> float: @examples : >>> df = 1 >>> N = 3 - log10(3/1) = .477 + .477 >>> df = 3 - log10(3/3) = log10(1) = 0 + 0 >>> df = 0 log10(3/0) -> throws ZeroDivisionError """ From bcbb8f680ec415fc9deb9380e9fd0736afe843fc Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 20:57:43 -0400 Subject: [PATCH 11/19] Fix for TravisCI --- machine_learning/word_frequency_functions.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 033a1ad47365..d33adb0e8dec 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -75,9 +75,7 @@ def document_frequency(term: str, corpus: str) -> int: searching for and the number of documents in the corpus @examples : >>> corpus = \ - "This is the first document in the corpus.\n \ - ThIs is the second document in the corpus.\n \ - THIS is the third document in the corpus." + "This is the first document in the corpus.\n ThIs is the second document in the corpus. \n THIS is the third document in the corpus." >>> term = "first" 1 >>> term = "document" @@ -110,10 +108,6 @@ def inverse_document_frequency(df : int, N: int) -> float: >>> df = 1 >>> N = 3 .477 - >>> df = 3 - 0 - >>> df = 0 - log10(3/0) -> throws ZeroDivisionError """ try: idf = round(log10(N / df), 3) From a2628d47bcc6ade369b8bd9211c2a0ba56848834 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Mon, 22 Jun 2020 21:10:37 -0400 Subject: [PATCH 12/19] Fix for TravisCI V2 --- machine_learning/word_frequency_functions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index d33adb0e8dec..22627e20e018 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -75,7 +75,9 @@ def document_frequency(term: str, corpus: str) -> int: searching for and the number of documents in the corpus @examples : >>> corpus = \ - "This is the first document in the corpus.\n ThIs is the second document in the corpus. \n THIS is the third document in the corpus." + "This is the first document in the corpus.\n ThIs is \ + the second document in the corpus. \n THIS is \ + the third document in the corpus." >>> term = "first" 1 >>> term = "document" From a0bef59b3aa392a40e7afbb93e10198ac76aebf4 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Tue, 23 Jun 2020 11:42:37 -0400 Subject: [PATCH 13/19] Tests passing locally --- machine_learning/word_frequency_functions.py | 23 ++++++++------------ 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 22627e20e018..7abcbf2b4f08 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -50,7 +50,6 @@ def term_frequency(term : str, document : str) -> int: found within the document @examples: - >>> document = "To be, or not to be" >>> term_frequency("to", "To be, or not to be") 2 """ @@ -74,16 +73,10 @@ def document_frequency(term: str, corpus: str) -> int: @returns : the number of documents in the corpus that contain the term you are searching for and the number of documents in the corpus @examples : - >>> corpus = \ - "This is the first document in the corpus.\n ThIs is \ - the second document in the corpus. \n THIS is \ - the third document in the corpus." - >>> term = "first" - 1 - >>> term = "document" - 3 - >>> term = "this" - 3 + >>> document_frequency("first", "This is the first document in the corpus.\\nThIs is\ +the second document in the corpus.\\nTHIS is \ +the third document in the corpus.") + (1, 3) """ corpus_without_punctuation = corpus.translate( str.maketrans("", "", string.punctuation) @@ -107,9 +100,8 @@ def inverse_document_frequency(df : int, N: int) -> float: the number of documents in the corpus. @returns : log10(N/df) @examples : - >>> df = 1 - >>> N = 3 - .477 + >>> inverse_document_frequency(1, 3) + 0.477 """ try: idf = round(log10(N / df), 3) @@ -128,5 +120,8 @@ def tf_idf(tf : int, idf: int) -> float: frequency : tf-idf = TF * IDF @params : tf, the term frequency, and idf, the inverse document frequency + @examples : + >>> tf_idf(2, 0.477) + 0.954 """ return round(tf * idf, 3) From 4cd803ae586e22fab762cf822d3dfdb40922a101 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Tue, 23 Jun 2020 11:49:36 -0400 Subject: [PATCH 14/19] Tests passing locally --- machine_learning/word_frequency_functions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 7abcbf2b4f08..acf72f80f4b8 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -73,8 +73,8 @@ def document_frequency(term: str, corpus: str) -> int: @returns : the number of documents in the corpus that contain the term you are searching for and the number of documents in the corpus @examples : - >>> document_frequency("first", "This is the first document in the corpus.\\nThIs is\ -the second document in the corpus.\\nTHIS is \ + >>> document_frequency("first", "This is the first document in the corpus.\\nThIs\ +is the second document in the corpus.\\nTHIS is \ the third document in the corpus.") (1, 3) """ @@ -120,7 +120,7 @@ def tf_idf(tf : int, idf: int) -> float: frequency : tf-idf = TF * IDF @params : tf, the term frequency, and idf, the inverse document frequency - @examples : + @examples : >>> tf_idf(2, 0.477) 0.954 """ From fcc07c930ecb616c27d028f98169406c8def7f52 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 15:37:30 -0400 Subject: [PATCH 15/19] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index acf72f80f4b8..6b3fc43f812d 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -66,7 +66,7 @@ def term_frequency(term : str, document : str) -> int: def document_frequency(term: str, corpus: str) -> int: """ - A function that calculates the number of documents in a corpus that contain a + Calculate the number of documents in a corpus that contain a given term @params : term, the term to search each document for, and corpus, a collection of documents. Each document should be separated by a newline. From d35b5a69048eb01393558bcc355a15c11a69d41a Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 15:37:36 -0400 Subject: [PATCH 16/19] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index 6b3fc43f812d..fc854fd5f67b 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -91,7 +91,7 @@ def document_frequency(term: str, corpus: str) -> int: def inverse_document_frequency(df : int, N: int) -> float: """ - A function that returns an integer denoting the importance + Return an integer denoting the importance of a word. This measure of importance is calculated by log10(N/df), where N is the number of documents and df is From e901e096204eb00176c79fa2b3e7dd8f49118cf4 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 15:37:45 -0400 Subject: [PATCH 17/19] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index fc854fd5f67b..faae5a81c6eb 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -42,7 +42,7 @@ def term_frequency(term : str, document : str) -> int: """ - A function that returns the number of times a term occurs within + Return the number of times a term occurs within a given document. @params: term, the term to search a document for, and document, the document to search within From 0a85c0fbe1375f17cbde80f14764c49c08db4cd8 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 15:37:51 -0400 Subject: [PATCH 18/19] Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss --- machine_learning/word_frequency_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index faae5a81c6eb..4434a6de808b 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -112,7 +112,7 @@ def inverse_document_frequency(df : int, N: int) -> float: def tf_idf(tf : int, idf: int) -> float: """ - A function that combines the term frequency + Combine the term frequency and inverse document frequency functions to calculate the originality of a term. This 'originality' is calculated by multiplying From fcef21e5889f601279f9d269ddfa322a60e01ad0 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Wed, 24 Jun 2020 16:15:30 -0400 Subject: [PATCH 19/19] Add doctest examples and clean up docstrings --- machine_learning/word_frequency_functions.py | 34 ++++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/machine_learning/word_frequency_functions.py b/machine_learning/word_frequency_functions.py index acf72f80f4b8..a105e30f5d3b 100644 --- a/machine_learning/word_frequency_functions.py +++ b/machine_learning/word_frequency_functions.py @@ -42,7 +42,7 @@ def term_frequency(term : str, document : str) -> int: """ - A function that returns the number of times a term occurs within + Return the number of times a term occurs within a given document. @params: term, the term to search a document for, and document, the document to search within @@ -58,15 +58,14 @@ def term_frequency(term : str, document : str) -> int: str.maketrans("", "", string.punctuation) ).replace("\n", "") tokenize_document = document_without_punctuation.split(" ") # word tokenization - term_frequency = len( + return len( [word for word in tokenize_document if word.lower() == term.lower()] ) - return term_frequency def document_frequency(term: str, corpus: str) -> int: """ - A function that calculates the number of documents in a corpus that contain a + Calculate the number of documents in a corpus that contain a given term @params : term, the term to search each document for, and corpus, a collection of documents. Each document should be separated by a newline. @@ -83,15 +82,14 @@ def document_frequency(term: str, corpus: str) -> int: ) # strip all punctuation and replace it with '' documents = corpus_without_punctuation.split("\n") lowercase_documents = [document.lower() for document in documents] - document_frequency = len( + return len( [document for document in lowercase_documents if term.lower() in document] - ) # number of documents that contain the term - return document_frequency, len(documents) + ), len(documents) def inverse_document_frequency(df : int, N: int) -> float: """ - A function that returns an integer denoting the importance + Return an integer denoting the importance of a word. This measure of importance is calculated by log10(N/df), where N is the number of documents and df is @@ -100,19 +98,27 @@ def inverse_document_frequency(df : int, N: int) -> float: the number of documents in the corpus. @returns : log10(N/df) @examples : + >>> inverse_document_frequency(3, 0) + Traceback (most recent call last): + ... + ValueError: log10(0) is undefined. >>> inverse_document_frequency(1, 3) 0.477 + >>> inverse_document_frequency(0, 3) + Traceback (most recent call last): + ... + ZeroDivisionError: df must be > 0 """ - try: - idf = round(log10(N / df), 3) - return idf - except ZeroDivisionError: - print("The term you searched for is not in the corpus.") + if df == 0: + raise ZeroDivisionError("df must be > 0") + elif N == 0: + raise ValueError("log10(0) is undefined.") + return round(log10(N / df), 3) def tf_idf(tf : int, idf: int) -> float: """ - A function that combines the term frequency + Combine the term frequency and inverse document frequency functions to calculate the originality of a term. This 'originality' is calculated by multiplying