GitHub Repository: eclipse/sumo
Path: blob/main/tools/build_config/bibtexUpdate.py
#!/usr/bin/env python
# Eclipse SUMO, Simulation of Urban MObility; see https://eclipse.dev/sumo
# Copyright (C) 2008-2025 German Aerospace Center (DLR) and others.
# This program and the accompanying materials are made available under the
# terms of the Eclipse Public License 2.0 which is available at
# https://www.eclipse.org/legal/epl-2.0/
# This Source Code may also be made available under the following Secondary
# Licenses when the conditions for such availability set forth in the Eclipse
# Public License 2.0 are satisfied: GNU General Public License, version 2
# or later which is available at
# https://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html
# SPDX-License-Identifier: EPL-2.0 OR GPL-2.0-or-later

# @file bibtexUpdate.py
# @author Mirko Barthauer
# @date 2024-10-29

"""
19
This script queries meta data from Semantic Scholar about publications citing
20
one of the papers given by their unique (Semantic Scholar) identifier.
21
Then it writes the results to a text file in bibtex syntax. Optionally it can
22
read an existing bibtex file and integrate existing entries with the new ones,
23
keeping the old annotations / Jabref comments and adding new ones for the new
24
entries. It uses the JabRef group syntax version 3.
25
Moreover it can run a keyword discover algorithm to display the most used keywords in the papers' abstracts.
26
"""


# https://api.semanticscholar.org/graph/v1/paper/{paper_id}/citations (paper_id can be
# "DOI:10.18653/v1/N18-3011", a Semantic Scholar hash such as "649def34f8be52c8b66281af98ae884c09aef38b", and others)

import os
import re
import time
import argparse
import requests
import bibtexparser
import unicodedata
from datetime import datetime

# maps Semantic Scholar publication types to bibtex entry types (see getBibtexType below)
ArticleTypeMap = {
    "JournalArticle": "Article",
    "Conference": "InProceedings",
    "Dataset": "Misc",
    "Book": "Book",
    "BookSection": "InBook"
}


def main(options):
    if options.bibtexInput is None or not os.path.exists(options.bibtexInput):
        print("The search results will not be merged with existing bibtex because no bibtex input file was found.")

    offsetLimit = 1000  # maximum number of entries the citation request of Semantic Scholar can return
    papers = []
    citationRequestTemplate = "https://api.semanticscholar.org/graph/v1/paper/%s/citations%s"
    delay = 1  # s
    foundDOI = []

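    # With the defaults above, the first request for a paper expands to (sketch, with
    # <paper_id> substituted by one of the --cited-works identifiers):
    #   https://api.semanticscholar.org/graph/v1/paper/<paper_id>/citations?offset=0&limit=1000&fields=title,url,authors,year,externalIds,citationCount,abstract,venue,journal,publicationTypes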
    # loop through original works the result papers should cite
    for originalDOI in options.citedWorks:
        # page through the citations with the maximum page size, keeping a delay between requests
        loop = True
        offset = 0
        while loop:
            # remember the call duration to limit the request rate
            callStart = datetime.now()
            params = {
                'offset': str(offset),
                'limit': str(offsetLimit),
                'fields': 'title,url,authors,year,externalIds,citationCount,abstract,venue,journal,publicationTypes',
            }
            fullRequest = citationRequestTemplate % (originalDOI, "" if len(params) == 0 else "?%s" % "&".join([
                "%s=%s" % (key, val) for key, val in params.items()]))
            print(fullRequest)
            rsp = requests.get(fullRequest)
            callEnd = datetime.now()
            results = rsp.json()
            if rsp.status_code == 400:
                requestError = results["error"]
                print(requestError)
            elif rsp.status_code == 404:
                print("The original paper with the identifier %s is not available at Semantic Scholar." % originalDOI)
                break
            rsp.raise_for_status()
            total = len(results["data"])
            if total == 0:
                print('No matches found. Please try another query.')
                return
            loop = "next" in results
            newPapers = [d['citingPaper'] for d in results['data']
                         if d['citingPaper']['externalIds'] is not None and
                         'DOI' in d['citingPaper']['externalIds'] and
                         d['citingPaper']['citationCount'] is not None and
                         d['citingPaper']['citationCount'] >= options.minCitations and
                         d['citingPaper']['externalIds']['DOI'] not in foundDOI]
            papers.extend(newPapers)
            foundDOI.extend([paper['externalIds']['DOI'] for paper in newPapers])
            if loop:
                offset += total
                # wait a little before the next request
                deltaTime = (callEnd - callStart).total_seconds()
                if deltaTime < delay:
                    time.sleep(delay - deltaTime)

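    # For reference, each citations response consumed above is JSON shaped roughly like
    # (a sketch inferred from the fields accessed in this loop, not the full API schema):
    #   {"offset": 0, "next": 1000,
    #    "data": [{"citingPaper": {"title": ..., "year": ..., "externalIds": {"DOI": ...},
    #              "citationCount": ..., "venue": ..., "journal": ..., "publicationTypes": [...]}}]}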
    if len(papers) == 0:
        print('No matches found. Please try another query.')
        return

    # exclude certain media
    if options.excludeMedia is not None:
        toRemove = []
        for paper in papers:
            if paper["venue"] is not None and paper["venue"] in options.excludeMedia:
                toRemove.append(paper)
        for r in toRemove:
            papers.remove(r)
        print("%d remaining papers after removal of named publications" % len(papers))

    if options.statistics:
        # compute statistics based on the search result
        from summa import keywords
        keywordsTotal = {}
        for paper in papers:
            if paper["abstract"] is None:
                continue
            abstractKeywords = keywords.keywords(paper["abstract"], scores=True)[:5]
            for abstractKeyword, score in abstractKeywords:
                if abstractKeyword not in keywordsTotal:
                    keywordsTotal[abstractKeyword] = 0
                keywordsTotal[abstractKeyword] += score

        # sorted list of the most used discovered keywords
        maxKeywords = 30
        # see https://www.analyticsvidhya.com/blog/2022/01/four-of-the-easiest-and-most-effective-methods-of-keyword-extraction-from-a-single-text-using-python/ # noqa
        sortedKeywords = sorted([(keyword, score) for keyword, score in keywordsTotal.items()],
                                key=lambda x: x[1], reverse=True)[:maxKeywords]
        print("\n\nFirst %d automatically discovered keywords:\n" % min(maxKeywords, len(sortedKeywords)))
        i = 1
        for keyword, score in sortedKeywords:
            print("\t%-3d %-100s %.3f" % (i, keyword, score))
            i += 1

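    # In the following, group tuples have the shape (indent, name, nr1, nr2), mirroring the
    # "<indent> StaticGroup:<name>\;<nr1>\;<nr2>\;..." lines of the JabRef grouping comment
    # that is parsed and written further down.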
    # insert entries from the Semantic Scholar search result
    lowerToOriginalKeyWords = {keyword.lower(): keyword for keyword in options.keywords} if options.keywords else {}

    blocks = []
    now = datetime.now()
    dateString = now.strftime("%Y%m%d")
    baseGroup = "SemanticScholar_%s" % (dateString)
    alreadyUsedKeywords = set()
    groups = [(1, baseGroup, 0, 0)]
    usedKeys = []
    newEntries = []
    for paper in papers:
        keywords = getRelevantKeyWords(paper, lowerToOriginalKeyWords) if options.keywords else []
        if len(keywords) > 0:
            extension = [(2, keyword, 0, 0) for keyword in keywords if keyword not in alreadyUsedKeywords]
            if len(extension) > 0:
                groups.extend(extension)
                alreadyUsedKeywords.update(keywords)
        keywords.append(baseGroup)
        newBlock = toBibTexBlock(paper, groups=keywords)
        generatedKey = newBlock.key
        i = 0
        while newBlock.key in usedKeys:
            newBlock.key = "%s_%d" % (generatedKey, i)
            i += 1
        newEntries.append(newBlock)
        usedKeys.append(newBlock.key)

    # sort new entries by year and citation keys
    newEntries.sort(key=lambda x: (x.get("year").value, x.key))

    # write JabRef specific comments
    # gather old groups from the JabRef comment and check whether some coincide with the new groups
    jabrefGroupHeader = "jabref-meta: grouping:\n0 AllEntriesGroup:;"
    if options.bibtexOutput is not None and options.bibtexInput is not None:  # combine two libraries
        print("Merge search results with old library %s" % options.bibtexInput)
        oldLibrary = bibtexparser.parse_file(options.bibtexInput)
        for comment in oldLibrary.comments:
            if comment.comment.startswith("jabref-meta: grouping:"):
                knownGroupNames = [group[1] for group in groups]  # the group name is the second tuple entry
                foundGroups = re.findall(r"([0-9]+)\sStaticGroup:([^\\;]+)\\;([0-9]+)\\;([0-9]+)", comment.comment)
                for foundGroup in foundGroups:
                    indent = int(foundGroup[0])
                    name = foundGroup[1]
                    nr1 = foundGroup[2]
                    nr2 = foundGroup[3]
                    if name not in knownGroupNames:
                        groups.append((indent, name, nr1, nr2))
                break
        blocks.extend(oldLibrary.entries)
        # remove double entries present in both the old library and the search results
        oldDOI = [block.get("DOI").value for block in blocks if block.get("DOI") is not None]
        newEntries = [newEntry for newEntry in newEntries if newEntry.get("DOI").value not in oldDOI]

    blocks.extend(newEntries)
    jabRefComment = bibtexparser.model.ExplicitComment("jabref-meta: databaseType:bibtex;")
    blocks.append(jabRefComment)
    groupText = "%s\n%s" % (jabrefGroupHeader, "\n".join(["%d StaticGroup:%s\\;%s\\;%s\\;0x8a8a8aff\\;\\;\\;;" % (
        indent, keyword, nr1, nr2) for indent, keyword, nr1, nr2 in groups]))
    groupComment = bibtexparser.model.ExplicitComment(groupText)
    blocks.append(groupComment)
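    # The grouping comment assembled above serializes like this (illustrative group names):
    #   jabref-meta: grouping:
    #   0 AllEntriesGroup:;
    #   1 StaticGroup:SemanticScholar_20241029\;0\;0\;0x8a8a8aff\;\;\;;
    #   2 StaticGroup:traffic\;0\;0\;0x8a8a8aff\;\;\;;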

    newLibrary = bibtexparser.library.Library(blocks)
    if options.bibtexOutput is not None:
        bibText = bibtexparser.write_string(newLibrary)
        with open(options.bibtexOutput, mode="w", encoding="UTF-8") as f:
            print("Write to %s" % options.bibtexOutput)
            f.write(bibText)


def formatName(name):
    # drop apostrophes and unicode hyphens and reduce accented characters to their
    # base character, e.g. "Müller" becomes "Muller" and "O'Neil" becomes "ONeil"
    return u"".join([unicodedata.normalize("NFKD", x)[0] for x in name if x not in ("'", "‐")])


def getRelevantKeyWords(paper, keywords):
    # return the original spelling of each keyword that occurs in the paper title;
    # keywords maps the lowercased spelling to the original one, so matching is case-insensitive
    result = []
    lowerTitle = paper["title"].lower()
    result.extend([originalKeyword for lowerKeyword, originalKeyword in keywords.items() if lowerKeyword in lowerTitle])
    return result


def getBibtexType(semanticScholarType):
    global ArticleTypeMap
    if semanticScholarType is None or semanticScholarType[0] not in ArticleTypeMap:
        return "Article"
    return ArticleTypeMap[semanticScholarType[0]]


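# toBibTexBlock below produces entries that serialize roughly as follows
# (a sketch with illustrative values, not verbatim bibtexparser output):
#   @Article{DoeAndRoe2024,
#       author = {Jane Doe and John Roe},
#       year = {2024},
#       title = {Some title},
#       journal = {Some journal},
#       DOI = {10.1000/example},
#       groups = {traffic,SemanticScholar_20241029},
#   }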
def toBibTexBlock(paper, groups=""):
    # generate the citation key, e.g. "DoeAndRoe2024" for up to two authors
    # and "DoeEtAl2024" for more than two
    authors = [a["name"].strip() for a in paper["authors"]]
    lastNames = [a[a.rindex(" ")+1:] for a in [formatName(author) for author in authors]]
    citKey = "%s%d" % ("%sEtAl" % lastNames[0] if len(lastNames) > 2 else "And".join(lastNames), paper["year"])
    entry = bibtexparser.model.Entry(getBibtexType(paper["publicationTypes"]), citKey, [])
    entry.set_field(bibtexparser.model.Field("author", " and ".join(authors)))
    entry.set_field(bibtexparser.model.Field("year", paper["year"]))
    entry.set_field(bibtexparser.model.Field("title", paper["title"]))
    if entry.entry_type == "Article" and paper["venue"] is not None:
        entry.set_field(bibtexparser.model.Field("journal", paper["venue"]))
        if paper["journal"] is not None:
            for field in ("pages", "volume"):
                if field in paper["journal"]:
                    entry.set_field(bibtexparser.model.Field(field, paper["journal"][field]))
    entry.set_field(bibtexparser.model.Field("DOI", paper['externalIds']['DOI']))
    if len(groups) > 0:
        entry.set_field(bibtexparser.model.Field("groups", ",".join(groups)))
    return entry


def getOptions(args=None):
    ap = argparse.ArgumentParser(
        prog='LiteratureCrawler',
        description='Search in literature databases and collect results in bibtex')
    ap.add_argument("--bibtex-input", dest="bibtexInput", type=str)
    ap.add_argument("--bibtex-output", dest="bibtexOutput", type=str)
    ap.add_argument("--exclude-media", dest="excludeMedia", nargs="+",
                    type=str, help="Exclude results from the given media")
    ap.add_argument("--cited-works", dest="citedWorks", nargs="+", type=str,
                    help="Works, given by their identifiers, whose citing papers should be searched "
                         "(Semantic Scholar hashes are confirmed to work; 'DOI:'-prefixed identifiers "
                         "do not, although the API doc mentions them)")
    ap.add_argument("--min-citations", dest="minCitations", type=int, default=10,
                    help="Minimum citation count of a paper to be included in the output")
    ap.add_argument("--keywords", nargs="+", type=str, help="Keywords to group the results in bibtex")
    ap.add_argument("--statistics", default=False, action="store_true",
                    help="Whether to print statistics about the search result")
    options = ap.parse_args(args=args)

    return options


if __name__ == "__main__":
    main(getOptions())