Skip to content

Commit 642ab57

Browse files
committed
Ability to generate publications from bibtex files.
Initial pass to generate publications from .bib files.
1 parent 072b90b commit 642ab57

File tree

2 files changed

+383
-0
lines changed

2 files changed

+383
-0
lines changed

markdown_generator/PubsFromBib.ipynb

+223
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Publications markdown generator for academicpages\n",
8+
"\n",
9+
"Takes a set of BibTeX files of publications and converts them for use with [academicpages.github.io](https://academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)). \n",
10+
"\n",
11+
"The core python code is also in `pubsFromBib.py`. \n",
12+
"Run either one from the `markdown_generator` folder after updating the `publist` dictionary with:\n",
13+
"* bib file names\n",
14+
"* specific venue keys based on your bib file preferences\n",
15+
"* any specific pre-text for specific files\n",
16+
"* Collection Name (future feature)\n",
17+
"\n",
18+
"TODO: Make this work with other databases of citations, \n",
19+
"TODO: Merge this with the existing TSV parsing solution"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": null,
25+
"metadata": {},
26+
"outputs": [],
27+
"source": [
28+
"from pybtex.database.input import bibtex\n",
29+
"import pybtex.database.input.bibtex \n",
30+
"from time import strptime\n",
31+
"import string\n",
32+
"import html\n",
33+
"import os\n",
34+
"import re"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": null,
40+
"metadata": {},
41+
"outputs": [],
42+
"source": [
43+
"#todo: incorporate different collection types rather than a catch all publications, requires other changes to template\n",
44+
"publist = {\n",
45+
" \"proceeding\": {\n",
46+
" \"file\" : \"proceedings.bib\",\n",
47+
" \"venuekey\": \"booktitle\",\n",
48+
" \"venue-pretext\": \"In the proceedings of \",\n",
49+
" \"collection\" : {\"name\":\"publications\",\n",
50+
" \"permalink\":\"/publication/\"}\n",
51+
" \n",
52+
" },\n",
53+
" \"journal\":{\n",
54+
" \"file\": \"pubs.bib\",\n",
55+
" \"venuekey\" : \"journal\",\n",
56+
" \"venue-pretext\" : \"\",\n",
57+
" \"collection\" : {\"name\":\"publications\",\n",
58+
" \"permalink\":\"/publication/\"}\n",
59+
" } \n",
60+
"}"
61+
]
62+
},
63+
{
64+
"cell_type": "code",
65+
"execution_count": null,
66+
"metadata": {},
67+
"outputs": [],
68+
"source": [
69+
"html_escape_table = {\n",
70+
"    \"&\": \"&amp;\",\n",
71+
"    '\"': \"&quot;\",\n",
72+
"    \"'\": \"&apos;\"\n",
73+
" }\n",
74+
"\n",
75+
"def html_escape(text):\n",
76+
" \"\"\"Produce entities within text.\"\"\"\n",
77+
" return \"\".join(html_escape_table.get(c,c) for c in text)"
78+
]
79+
},
80+
{
81+
"cell_type": "code",
82+
"execution_count": null,
83+
"metadata": {
84+
"scrolled": false
85+
},
86+
"outputs": [],
87+
"source": [
88+
"for pubsource in publist:\n",
89+
" parser = bibtex.Parser()\n",
90+
" bibdata = parser.parse_file(publist[pubsource][\"file\"])\n",
91+
"\n",
92+
" #loop through the individual references in a given bibtex file\n",
93+
" for bib_id in bibdata.entries:\n",
94+
" #reset default date\n",
95+
" pub_year = \"1900\"\n",
96+
" pub_month = \"01\"\n",
97+
" pub_day = \"01\"\n",
98+
" \n",
99+
" b = bibdata.entries[bib_id].fields\n",
100+
" \n",
101+
" try:\n",
102+
" pub_year = f'{b[\"year\"]}'\n",
103+
"\n",
104+
" #todo: this hack for month and day needs some cleanup\n",
105+
" if \"month\" in b.keys(): \n",
106+
" if(len(b[\"month\"])<3):\n",
107+
" pub_month = \"0\"+b[\"month\"]\n",
108+
" pub_month = pub_month[-2:]\n",
109+
" elif(b[\"month\"] not in range(12)):\n",
110+
" tmnth = strptime(b[\"month\"][:3],'%b').tm_mon \n",
111+
" pub_month = \"{:02d}\".format(tmnth) \n",
112+
" else:\n",
113+
" pub_month = str(b[\"month\"])\n",
114+
" if \"day\" in b.keys(): \n",
115+
" pub_day = str(b[\"day\"])\n",
116+
"\n",
117+
" \n",
118+
" pub_date = pub_year+\"-\"+pub_month+\"-\"+pub_day\n",
119+
" \n",
120+
" #strip out {} as needed (some bibtex entries that maintain formatting)\n",
121+
" clean_title = b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\").replace(\" \",\"-\") \n",
122+
"\n",
123+
" url_slug = re.sub(\"\\\\[.*\\\\]|[^a-zA-Z0-9_-]\", \"\", clean_title)\n",
124+
" url_slug = url_slug.replace(\"--\",\"-\")\n",
125+
"\n",
126+
" md_filename = (str(pub_date) + \"-\" + url_slug + \".md\").replace(\"--\",\"-\")\n",
127+
" html_filename = (str(pub_date) + \"-\" + url_slug).replace(\"--\",\"-\")\n",
128+
"\n",
129+
" #Build Citation from text\n",
130+
" citation = \"\"\n",
131+
"\n",
132+
" #citation authors - todo - add highlighting for primary author?\n",
133+
" for author in bibdata.entries[bib_id].persons[\"author\"]:\n",
134+
" citation = citation+\" \"+author.first_names[0]+\" \"+author.last_names[0]+\", \"\n",
135+
"\n",
136+
" #citation title\n",
137+
" citation = citation + \"\\\"\" + html_escape(b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")) + \".\\\"\"\n",
138+
"\n",
139+
" #add venue logic depending on citation type\n",
140+
" venue = publist[pubsource][\"venue-pretext\"]+b[publist[pubsource][\"venuekey\"]].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")\n",
141+
"\n",
142+
" citation = citation + \" \" + html_escape(venue)\n",
143+
" citation = citation + \", \" + pub_year + \".\"\n",
144+
"\n",
145+
" \n",
146+
" ## YAML variables\n",
147+
" md = \"---\\ntitle: \\\"\" + html_escape(b[\"title\"].replace(\"{\", \"\").replace(\"}\",\"\").replace(\"\\\\\",\"\")) + '\"\\n'\n",
148+
" \n",
149+
" md += \"\"\"collection: \"\"\" + publist[pubsource][\"collection\"][\"name\"]\n",
150+
"\n",
151+
" md += \"\"\"\\npermalink: \"\"\" + publist[pubsource][\"collection\"][\"permalink\"] + html_filename\n",
152+
" \n",
153+
" note = False\n",
154+
" if \"note\" in b.keys():\n",
155+
" if len(str(b[\"note\"])) > 5:\n",
156+
" md += \"\\nexcerpt: '\" + html_escape(b[\"note\"]) + \"'\"\n",
157+
" note = True\n",
158+
"\n",
159+
" md += \"\\ndate: \" + str(pub_date) \n",
160+
"\n",
161+
" md += \"\\nvenue: '\" + html_escape(venue) + \"'\"\n",
162+
" \n",
163+
" url = False\n",
164+
" if \"url\" in b.keys():\n",
165+
" if len(str(b[\"url\"])) > 5:\n",
166+
" md += \"\\npaperurl: '\" + b[\"url\"] + \"'\"\n",
167+
" url = True\n",
168+
"\n",
169+
" md += \"\\ncitation: '\" + html_escape(citation) + \"'\"\n",
170+
"\n",
171+
" md += \"\\n---\"\n",
172+
"\n",
173+
" \n",
174+
" ## Markdown description for individual page\n",
175+
" if note:\n",
176+
" md += \"\\n\" + html_escape(b[\"note\"]) + \"\\n\"\n",
177+
"\n",
178+
" if url:\n",
179+
" md += \"\\n[Access paper here](\" + b[\"url\"] + \"){:target=\\\"_blank\\\"}\\n\" \n",
180+
" else:\n",
181+
" md += \"\\nUse [Google Scholar](https://scholar.google.com/scholar?q=\"+html.escape(clean_title.replace(\"-\",\"+\"))+\"){:target=\\\"_blank\\\"} for full citation\"\n",
182+
"\n",
183+
" md_filename = os.path.basename(md_filename)\n",
184+
"\n",
185+
" with open(\"../_publications/\" + md_filename, 'w') as f:\n",
186+
" f.write(md)\n",
187+
" print(f'SUCESSFULLY PARSED {bib_id}: \\\"', b[\"title\"][:60],\"...\"*(len(b['title'])>60),\"\\\"\")\n",
188+
" # field may not exist for a reference\n",
189+
" except KeyError as e:\n",
190+
" print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \\\"', b[\"title\"][:30],\"...\"*(len(b['title'])>30),\"\\\"\")\n",
191+
" continue\n"
192+
]
193+
},
194+
{
195+
"cell_type": "code",
196+
"execution_count": null,
197+
"metadata": {},
198+
"outputs": [],
199+
"source": []
200+
}
201+
],
202+
"metadata": {
203+
"kernelspec": {
204+
"display_name": "Python 3",
205+
"language": "python",
206+
"name": "python3"
207+
},
208+
"language_info": {
209+
"codemirror_mode": {
210+
"name": "ipython",
211+
"version": 3
212+
},
213+
"file_extension": ".py",
214+
"mimetype": "text/x-python",
215+
"name": "python",
216+
"nbconvert_exporter": "python",
217+
"pygments_lexer": "ipython3",
218+
"version": "3.7.1"
219+
}
220+
},
221+
"nbformat": 4,
222+
"nbformat_minor": 2
223+
}

markdown_generator/pubsFromBib.py

+160
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
#!/usr/bin/env python
2+
# coding: utf-8
3+
4+
# # Publications markdown generator for academicpages
5+
#
6+
# Takes a set of BibTeX files of publications and converts them for use with [academicpages.github.io](https://academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)).
7+
#
8+
# The core python code is also in `pubsFromBib.py`.
9+
# Run either one from the `markdown_generator` folder after updating the `publist` dictionary with:
10+
# * bib file names
11+
# * specific venue keys based on your bib file preferences
12+
# * any specific pre-text for specific files
13+
# * Collection Name (future feature)
14+
#
15+
# TODO: Make this work with other databases of citations,
16+
# TODO: Merge this with the existing TSV parsing solution
17+
18+
19+
from pybtex.database.input import bibtex
20+
import pybtex.database.input.bibtex
21+
from time import strptime
22+
import string
23+
import html
24+
import os
25+
import re
26+
27+
#todo: incorporate different collection types rather than a catch all publications, requires other changes to template
28+
# Configuration for every bibtex source to convert. Each top-level key names a
# source; its value describes how entries from that source are rendered:
#   "file"          - bibtex file that is parsed for this source
#   "venuekey"      - bibtex field whose value is used as the venue
#   "venue-pretext" - text placed in front of the venue
#   "collection"    - collection name and permalink prefix written to the
#                     front matter of each generated markdown file
publist = {
    "proceeding": {
        "file": "proceedings.bib",
        "venuekey": "booktitle",
        "venue-pretext": "In the proceedings of ",
        "collection": {
            "name": "publications",
            "permalink": "/publication/",
        },
    },
    "journal": {
        "file": "pubs.bib",
        "venuekey": "journal",
        "venue-pretext": "",
        "collection": {
            "name": "publications",
            "permalink": "/publication/",
        },
    },
}
45+
46+
# Characters that are replaced by HTML entities before being written into the
# quoted front-matter values.
html_escape_table = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
}


def html_escape(text):
    """Return ``text`` with every character listed in ``html_escape_table``
    replaced by its entity; all other characters pass through unchanged."""
    escaped_pieces = []
    for character in text:
        escaped_pieces.append(html_escape_table.get(character, character))
    return "".join(escaped_pieces)
55+
56+
57+
def _month_number(raw_month, default="01"):
    """Return a two-digit month string for a bibtex ``month`` field.

    Numeric months ("3", "03") are zero-padded. Otherwise the first three
    characters are parsed as an abbreviated month name with ``strptime``'s
    ``%b`` (NOTE(review): ``%b`` follows the current locale). Anything that
    cannot be parsed, or is outside 1-12, yields ``default``.
    """
    month_text = str(raw_month).strip()
    if month_text.isdecimal():
        month_value = int(month_text)
        return "{:02d}".format(month_value) if 1 <= month_value <= 12 else default
    try:
        return "{:02d}".format(strptime(month_text[:3], '%b').tm_mon)
    except ValueError:
        # Unknown month name: keep going with the default instead of aborting the run.
        return default


def _day_number(raw_day, default="01"):
    """Return a zero-padded two-digit day string, or ``default`` when the
    bibtex ``day`` field is not a number between 1 and 31."""
    day_text = str(raw_day).strip()
    if day_text.isdecimal() and 1 <= int(day_text) <= 31:
        return "{:02d}".format(int(day_text))
    return default


def _strip_bibtex(text):
    """Remove the ``{``, ``}`` and backslash characters bibtex uses for formatting."""
    return text.replace("{", "").replace("}", "").replace("\\", "")


# Convert every entry of every configured bibtex file into one markdown file
# under ../_publications/. Entries missing a required field are reported and
# skipped; the remaining entries are still processed.
for pubsource in publist:
    source_config = publist[pubsource]
    parser = bibtex.Parser()
    bibdata = parser.parse_file(source_config["file"])

    # loop through the individual references in a given bibtex file
    for bib_id in bibdata.entries:
        entry = bibdata.entries[bib_id]
        b = entry.fields

        try:
            # "year" is required; month and day fall back to "01" when absent or unparseable
            pub_year = f'{b["year"]}'
            pub_month = _month_number(b["month"]) if "month" in b else "01"
            pub_day = _day_number(b["day"]) if "day" in b else "01"
            pub_date = pub_year + "-" + pub_month + "-" + pub_day

            # strip out {} as needed (some bibtex entries that maintain formatting)
            plain_title = _strip_bibtex(b["title"])
            clean_title = plain_title.replace(" ", "-")

            url_slug = re.sub("\\[.*\\]|[^a-zA-Z0-9_-]", "", clean_title)
            url_slug = url_slug.replace("--", "-")

            md_filename = (str(pub_date) + "-" + url_slug + ".md").replace("--", "-")
            html_filename = (str(pub_date) + "-" + url_slug).replace("--", "-")

            # Build the citation from raw text; it is HTML-escaped exactly once,
            # when it is written to the front matter below.
            citation = ""

            # citation authors - todo - add highlighting for primary author?
            for author in entry.persons["author"]:
                # Slicing tolerates authors without a first (or last) name,
                # e.g. organisations, where indexing [0] would raise IndexError.
                name_parts = list(author.first_names[:1]) + list(author.last_names[:1])
                citation = citation + " " + " ".join(name_parts) + ", "

            # citation title
            citation = citation + "\"" + plain_title + ".\""

            # add venue logic depending on citation type
            venue = source_config["venue-pretext"] + _strip_bibtex(b[source_config["venuekey"]])

            citation = citation + " " + venue
            citation = citation + ", " + pub_year + "."

            ## YAML variables
            md = "---\ntitle: \"" + html_escape(plain_title) + '"\n'
            md += "collection: " + source_config["collection"]["name"]
            md += "\npermalink: " + source_config["collection"]["permalink"] + html_filename

            # Notes and URLs of five characters or fewer are treated as absent.
            note = "note" in b and len(str(b["note"])) > 5
            if note:
                md += "\nexcerpt: '" + html_escape(b["note"]) + "'"

            md += "\ndate: " + str(pub_date)
            md += "\nvenue: '" + html_escape(venue) + "'"

            url = "url" in b and len(str(b["url"])) > 5
            if url:
                md += "\npaperurl: '" + b["url"] + "'"

            md += "\ncitation: '" + html_escape(citation) + "'"
            md += "\n---"

            ## Markdown description for individual page
            if note:
                md += "\n" + html_escape(b["note"]) + "\n"

            if url:
                md += "\n[Access paper here](" + b["url"] + "){:target=\"_blank\"}\n"
            else:
                md += "\nUse [Google Scholar](https://scholar.google.com/scholar?q="+html.escape(clean_title.replace("-","+"))+"){:target=\"_blank\"} for full citation"

            md_filename = os.path.basename(md_filename)

            # Explicit encoding so non-ASCII titles and author names are written
            # the same way on every platform.
            with open("../_publications/" + md_filename, 'w', encoding="utf-8") as f:
                f.write(md)
            print(f'SUCESSFULLY PARSED {bib_id}: \"', b["title"][:60],"..."*(len(b['title'])>60),"\"")
        # field may not exist for a reference
        except KeyError as e:
            # The missing field may be the title itself, so do not index it here.
            title = b.get("title", "")
            print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', title[:30],"..."*(len(title)>30),"\"")

0 commit comments

Comments
 (0)