From 3e5e9a38fcafa08cbef929bc8f12544b7800100a Mon Sep 17 00:00:00 2001 From: boyun Date: Wed, 11 Nov 2020 23:50:29 +0900 Subject: [PATCH 01/12] add crawl_google_scholar_citation.py --- .../crawl_google_scholar_citation.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 web_programming/crawl_google_scholar_citation.py diff --git a/web_programming/crawl_google_scholar_citation.py b/web_programming/crawl_google_scholar_citation.py new file mode 100644 index 000000000000..e639741d532c --- /dev/null +++ b/web_programming/crawl_google_scholar_citation.py @@ -0,0 +1,64 @@ + +""" +Get the citation from google scholar +using title and year of publication, and volume and pages of journal. +""" + +from bs4 import BeautifulSoup +import requests +import re + + +def create_url(title: str, journal: str, volume: str, page: str, year: str) -> str: + """ + Return the url. + """ + url = f"http://scholar.google.com/scholar_lookup?hl=en&title={title}&journal={journal}&volume={volume}&pages={page}&publication_year={year}" + url = remove_tag(url) + return url.replace(" ", "%") + + +def remove_tag(url: str) -> str: + """ + Return the url removed the html tags. + """ + tag = re.compile('<.*?>') + clean_url = re.sub(tag, '', url) + return clean_url + + +def get_citation(url: str) -> str: + """ + Return the citation number. + """ + url = requests.get(url).text + soup = BeautifulSoup(url, "html.parser") + get_div = soup.find(u"div", attrs={u"class": u"gs_ri"}) + get_a_tag = get_div.find(u"div", attrs={u"class": u"gs_fl"}).findAll('a') + citation = get_a_tag[2].get_text() + if 'Cited' not in citation: + citation = 'Cited by 0' + + return citation.replace("Cited by ", "") + + +if __name__ == '__main__': + """ + You have to fill following values: title, journal_name, volume, page, year. + + For example, + title = "Precisely geometry controlled microsupercapacitors for ultrahigh areal capacitance, volumetric capacitance, and energy density" + journal_name = "Chem. Mater" + volume = "30" + page = "3979-3990" + year = "2018" + """ + title = "" + journal_name = "" + volume = "" + page = "" + year = "" + + citation = get_citation(create_url(title, journal_name, volume, page, year)) + print(citation) + From 47efd43125e7efbd790b645ef1aaf9a938209421 Mon Sep 17 00:00:00 2001 From: boyun Date: Thu, 12 Nov 2020 00:07:10 +0900 Subject: [PATCH 02/12] pass flack8 --- .../crawl_google_scholar_citation.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/web_programming/crawl_google_scholar_citation.py b/web_programming/crawl_google_scholar_citation.py index e639741d532c..1fb52efa5dd7 100644 --- a/web_programming/crawl_google_scholar_citation.py +++ b/web_programming/crawl_google_scholar_citation.py @@ -13,13 +13,19 @@ def create_url(title: str, journal: str, volume: str, page: str, year: str) -> s """ Return the url. """ - url = f"http://scholar.google.com/scholar_lookup?hl=en&title={title}&journal={journal}&volume={volume}&pages={page}&publication_year={year}" + url = f"http://scholar.google.com/scholar_lookup?hl=en&" \ + f"title={title}" \ + f"&journal={journal}" \ + f"&volume={volume}" \ + f"&pages={page}" \ + f"&publication_year={year}" url = remove_tag(url) return url.replace(" ", "%") def remove_tag(url: str) -> str: """ + Remove the html tags in 'url'. Return the url removed the html tags. """ tag = re.compile('<.*?>') @@ -44,14 +50,13 @@ def get_citation(url: str) -> str: if __name__ == '__main__': """ - You have to fill following values: title, journal_name, volume, page, year. - + You have to fill following values: title, journal_name, volume, page, year. For example, - title = "Precisely geometry controlled microsupercapacitors for ultrahigh areal capacitance, volumetric capacitance, and energy density" - journal_name = "Chem. Mater" + title = "abcde" + journal_name = "fgh" volume = "30" page = "3979-3990" - year = "2018" + year = "2020" """ title = "" journal_name = "" @@ -61,4 +66,3 @@ def get_citation(url: str) -> str: citation = get_citation(create_url(title, journal_name, volume, page, year)) print(citation) - From e08b235bbe089027f9ee16901ae0e44b83036c05 Mon Sep 17 00:00:00 2001 From: boyun Date: Thu, 12 Nov 2020 00:28:42 +0900 Subject: [PATCH 03/12] pass isort --- web_programming/crawl_google_results.py | 1 + 1 file changed, 1 insertion(+) diff --git a/web_programming/crawl_google_results.py b/web_programming/crawl_google_results.py index a33a3f3bbe5c..caacea52c3e5 100644 --- a/web_programming/crawl_google_results.py +++ b/web_programming/crawl_google_results.py @@ -3,6 +3,7 @@ import requests from bs4 import BeautifulSoup + from fake_useragent import UserAgent if __name__ == "__main__": From a614c9d4b823c6caa73df8d17fdd17df68d6da6a Mon Sep 17 00:00:00 2001 From: boyun Date: Thu, 12 Nov 2020 00:33:33 +0900 Subject: [PATCH 04/12] pass isort --- .../crawl_google_scholar_citation.py | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/web_programming/crawl_google_scholar_citation.py b/web_programming/crawl_google_scholar_citation.py index 1fb52efa5dd7..a45a20610a43 100644 --- a/web_programming/crawl_google_scholar_citation.py +++ b/web_programming/crawl_google_scholar_citation.py @@ -1,24 +1,26 @@ - """ Get the citation from google scholar using title and year of publication, and volume and pages of journal. """ -from bs4 import BeautifulSoup -import requests import re +import requests +from bs4 import BeautifulSoup + def create_url(title: str, journal: str, volume: str, page: str, year: str) -> str: """ Return the url. """ - url = f"http://scholar.google.com/scholar_lookup?hl=en&" \ - f"title={title}" \ - f"&journal={journal}" \ - f"&volume={volume}" \ - f"&pages={page}" \ - f"&publication_year={year}" + url = ( + f"http://scholar.google.com/scholar_lookup?hl=en&" + f"title={title}" + f"&journal={journal}" + f"&volume={volume}" + f"&pages={page}" + f"&publication_year={year}" + ) url = remove_tag(url) return url.replace(" ", "%") @@ -28,8 +30,8 @@ def remove_tag(url: str) -> str: Remove the html tags in 'url'. Return the url removed the html tags. """ - tag = re.compile('<.*?>') - clean_url = re.sub(tag, '', url) + tag = re.compile("<.*?>") + clean_url = re.sub(tag, "", url) return clean_url @@ -39,20 +41,20 @@ def get_citation(url: str) -> str: """ url = requests.get(url).text soup = BeautifulSoup(url, "html.parser") - get_div = soup.find(u"div", attrs={u"class": u"gs_ri"}) - get_a_tag = get_div.find(u"div", attrs={u"class": u"gs_fl"}).findAll('a') + get_div = soup.find("div", attrs={"class": "gs_ri"}) + get_a_tag = get_div.find("div", attrs={"class": "gs_fl"}).findAll("a") citation = get_a_tag[2].get_text() - if 'Cited' not in citation: - citation = 'Cited by 0' + if "Cited" not in citation: + citation = "Cited by 0" return citation.replace("Cited by ", "") -if __name__ == '__main__': +if __name__ == "__main__": """ You have to fill following values: title, journal_name, volume, page, year. For example, - title = "abcde" + title = "abc de" journal_name = "fgh" volume = "30" page = "3979-3990" From 0ee618ff24bb47831ae8caa70236e2a66f06bd27 Mon Sep 17 00:00:00 2001 From: boyun Date: Thu, 12 Nov 2020 09:58:28 +0900 Subject: [PATCH 05/12] change comment in main --- web_programming/crawl_google_scholar_citation.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/web_programming/crawl_google_scholar_citation.py b/web_programming/crawl_google_scholar_citation.py index a45a20610a43..ba11875a6f77 100644 --- a/web_programming/crawl_google_scholar_citation.py +++ b/web_programming/crawl_google_scholar_citation.py @@ -53,12 +53,6 @@ def get_citation(url: str) -> str: if __name__ == "__main__": """ You have to fill following values: title, journal_name, volume, page, year. - For example, - title = "abc de" - journal_name = "fgh" - volume = "30" - page = "3979-3990" - year = "2020" """ title = "" journal_name = "" From 14c55a795d8a3b75fc1063514e7c757932098457 Mon Sep 17 00:00:00 2001 From: boyun Date: Fri, 13 Nov 2020 19:39:27 +0900 Subject: [PATCH 06/12] modify main code --- web_programming/crawl_google_scholar_citation.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/web_programming/crawl_google_scholar_citation.py b/web_programming/crawl_google_scholar_citation.py index ba11875a6f77..1e582a284c31 100644 --- a/web_programming/crawl_google_scholar_citation.py +++ b/web_programming/crawl_google_scholar_citation.py @@ -54,11 +54,15 @@ def get_citation(url: str) -> str: """ You have to fill following values: title, journal_name, volume, page, year. """ - title = "" - journal_name = "" - volume = "" - page = "" - year = "" + title = ( + "Precisely geometry controlled microsupercapacitors" + " for ultrahigh areal capacitance," + " volumetric capacitance, and energy density" + ) + journal_name = "Chem. Mater." + volume = "30" + page = "3979-3990" + year = "2018" citation = get_citation(create_url(title, journal_name, volume, page, year)) print(citation) From a97c312b086976c9e6b1a60308b58f4b08aa2859 Mon Sep 17 00:00:00 2001 From: boyun Date: Fri, 13 Nov 2020 19:55:29 +0900 Subject: [PATCH 07/12] delete file --- web_programming/crawl_google_results.py | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 web_programming/crawl_google_results.py diff --git a/web_programming/crawl_google_results.py b/web_programming/crawl_google_results.py deleted file mode 100644 index caacea52c3e5..000000000000 --- a/web_programming/crawl_google_results.py +++ /dev/null @@ -1,25 +0,0 @@ -import sys -import webbrowser - -import requests -from bs4 import BeautifulSoup - -from fake_useragent import UserAgent - -if __name__ == "__main__": - print("Googling.....") - url = "https://www.google.com/search?q=" + " ".join(sys.argv[1:]) - res = requests.get(url, headers={"UserAgent": UserAgent().random}) - # res.raise_for_status() - with open("project1a.html", "wb") as out_file: # only for knowing the class - for data in res.iter_content(10000): - out_file.write(data) - soup = BeautifulSoup(res.text, "html.parser") - links = list(soup.select(".eZt8xd"))[:5] - - print(len(links)) - for link in links: - if link.text == "Maps": - webbrowser.open(link.get("href")) - else: - webbrowser.open(f"http://google.com{link.get('href')}") From 6c3d24d3880497471211b184c74c8b18a5f3ec8c Mon Sep 17 00:00:00 2001 From: boyun Date: Fri, 13 Nov 2020 20:35:42 +0900 Subject: [PATCH 08/12] change how to build url --- .../crawl_google_scholar_citation.py | 61 +++++-------------- 1 file changed, 15 insertions(+), 46 deletions(-) diff --git a/web_programming/crawl_google_scholar_citation.py b/web_programming/crawl_google_scholar_citation.py index 1e582a284c31..22c73f20cc99 100644 --- a/web_programming/crawl_google_scholar_citation.py +++ b/web_programming/crawl_google_scholar_citation.py @@ -3,44 +3,15 @@ using title and year of publication, and volume and pages of journal. """ -import re - import requests from bs4 import BeautifulSoup -def create_url(title: str, journal: str, volume: str, page: str, year: str) -> str: - """ - Return the url. - """ - url = ( - f"http://scholar.google.com/scholar_lookup?hl=en&" - f"title={title}" - f"&journal={journal}" - f"&volume={volume}" - f"&pages={page}" - f"&publication_year={year}" - ) - url = remove_tag(url) - return url.replace(" ", "%") - - -def remove_tag(url: str) -> str: - """ - Remove the html tags in 'url'. - Return the url removed the html tags. - """ - tag = re.compile("<.*?>") - clean_url = re.sub(tag, "", url) - return clean_url - - -def get_citation(url: str) -> str: +def get_citation(base_url: str, params: dict) -> str: """ Return the citation number. """ - url = requests.get(url).text - soup = BeautifulSoup(url, "html.parser") + soup = BeautifulSoup(requests.get(base_url, params=params).content, "html.parser") get_div = soup.find("div", attrs={"class": "gs_ri"}) get_a_tag = get_div.find("div", attrs={"class": "gs_fl"}).findAll("a") citation = get_a_tag[2].get_text() @@ -51,18 +22,16 @@ def get_citation(url: str) -> str: if __name__ == "__main__": - """ - You have to fill following values: title, journal_name, volume, page, year. - """ - title = ( - "Precisely geometry controlled microsupercapacitors" - " for ultrahigh areal capacitance," - " volumetric capacitance, and energy density" - ) - journal_name = "Chem. Mater." - volume = "30" - page = "3979-3990" - year = "2018" - - citation = get_citation(create_url(title, journal_name, volume, page, year)) - print(citation) + params = { + "title": ( + "Precisely geometry controlled microsupercapacitors" + " for ultrahigh areal capacitance," + " volumetric capacitance, and energy density" + ), + "journal_name": "Chem. Mater.", + "volume": "30", + "page": "3979-3990", + "year": "2018", + } + + print(get_citation("http://scholar.google.com/scholar_lookup?hl=en&", params=params)) From 3ee5392092162d3733f59eea8dbc976c80349042 Mon Sep 17 00:00:00 2001 From: boyun Date: Fri, 13 Nov 2020 21:49:45 +0900 Subject: [PATCH 09/12] add a key 'hl' in params dict --- web_programming/crawl_google_scholar_citation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/web_programming/crawl_google_scholar_citation.py b/web_programming/crawl_google_scholar_citation.py index 22c73f20cc99..4a96c9c7154e 100644 --- a/web_programming/crawl_google_scholar_citation.py +++ b/web_programming/crawl_google_scholar_citation.py @@ -15,10 +15,8 @@ def get_citation(base_url: str, params: dict) -> str: get_div = soup.find("div", attrs={"class": "gs_ri"}) get_a_tag = get_div.find("div", attrs={"class": "gs_fl"}).findAll("a") citation = get_a_tag[2].get_text() - if "Cited" not in citation: - citation = "Cited by 0" - return citation.replace("Cited by ", "") + return citation if __name__ == "__main__": @@ -28,10 +26,12 @@ def get_citation(base_url: str, params: dict) -> str: " for ultrahigh areal capacitance," " volumetric capacitance, and energy density" ), - "journal_name": "Chem. Mater.", + "journal": "Chem. Mater.", "volume": "30", - "page": "3979-3990", + "pages": "3979-3990", "year": "2018", + "hl": "en" + , } - print(get_citation("http://scholar.google.com/scholar_lookup?hl=en&", params=params)) + print(get_citation("http://scholar.google.com/scholar_lookup", params=params)) From 7cf27f6c5d3d263188fd52dc6c418528b173b91c Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 13 Nov 2020 14:43:57 +0100 Subject: [PATCH 10/12] Update crawl_google_scholar_citation.py --- .../crawl_google_scholar_citation.py | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/web_programming/crawl_google_scholar_citation.py b/web_programming/crawl_google_scholar_citation.py index 4a96c9c7154e..d023380c0818 100644 --- a/web_programming/crawl_google_scholar_citation.py +++ b/web_programming/crawl_google_scholar_citation.py @@ -12,26 +12,21 @@ def get_citation(base_url: str, params: dict) -> str: Return the citation number. """ soup = BeautifulSoup(requests.get(base_url, params=params).content, "html.parser") - get_div = soup.find("div", attrs={"class": "gs_ri"}) - get_a_tag = get_div.find("div", attrs={"class": "gs_fl"}).findAll("a") - citation = get_a_tag[2].get_text() - - return citation + div = soup.find("div", attrs={"class": "gs_ri"}) + anchors = div.find("div", attrs={"class": "gs_fl"}).find_all("a") + return anchors[2].get_text() if __name__ == "__main__": params = { "title": ( - "Precisely geometry controlled microsupercapacitors" - " for ultrahigh areal capacitance," - " volumetric capacitance, and energy density" + "Precisely geometry controlled microsupercapacitors for ultrahigh areal " + "capacitance, volumetric capacitance, and energy density" ), "journal": "Chem. Mater.", - "volume": "30", + "volume": 30, "pages": "3979-3990", - "year": "2018", - "hl": "en" - , + "year": 2018, + "hl": "en", } - print(get_citation("http://scholar.google.com/scholar_lookup", params=params)) From d0b60d276d144b78fce183506acb0c481d010352 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 13 Nov 2020 14:46:50 +0100 Subject: [PATCH 11/12] Create crawl_google_results.py --- web_programming/crawl_google_results.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 web_programming/crawl_google_results.py diff --git a/web_programming/crawl_google_results.py b/web_programming/crawl_google_results.py new file mode 100644 index 000000000000..a33a3f3bbe5c --- /dev/null +++ b/web_programming/crawl_google_results.py @@ -0,0 +1,24 @@ +import sys +import webbrowser + +import requests +from bs4 import BeautifulSoup +from fake_useragent import UserAgent + +if __name__ == "__main__": + print("Googling.....") + url = "https://www.google.com/search?q=" + " ".join(sys.argv[1:]) + res = requests.get(url, headers={"UserAgent": UserAgent().random}) + # res.raise_for_status() + with open("project1a.html", "wb") as out_file: # only for knowing the class + for data in res.iter_content(10000): + out_file.write(data) + soup = BeautifulSoup(res.text, "html.parser") + links = list(soup.select(".eZt8xd"))[:5] + + print(len(links)) + for link in links: + if link.text == "Maps": + webbrowser.open(link.get("href")) + else: + webbrowser.open(f"http://google.com{link.get('href')}") From 0a206eca1bb6d0a7e58e10839bdc4e468929d816 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Fri, 13 Nov 2020 14:52:17 +0100 Subject: [PATCH 12/12] codespell: Mater --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 01da6cad0335..a3288e1c5eef 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -42,7 +42,7 @@ repos: hooks: - id: codespell args: - - --ignore-words-list=ans,fo,followings,hist,iff,secant,som,tim + - --ignore-words-list=ans,fo,followings,hist,iff,mater,secant,som,tim - --skip="./.*,./other/dictionary.txt,./other/words,./project_euler/problem_022/p022_names.txt" - --quiet-level=2 exclude: |