"""
This script queries meta data from Semantic Scholar about publications citing
one of the papers given by their unique (Semantic Scholar) identifier.
Then it writes the results to a text file in bibtex syntax. Optionally it can
read an existing bibtex file and integrate existing entries with the new ones,
keeping the old annotations / Jabref comments and adding new ones for the new
entries. It uses the JabRef group syntax version 3.
Moreover it can run a keyword discover algorithm to display the most used keywords in the papers' abstracts.
"""
import os
import re
import time
import argparse
import requests
import bibtexparser
import unicodedata
from datetime import datetime
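# Maps Semantic Scholar publication types to BibTeX entry types; anything not
# listed here falls back to "Article" (see getBibtexType below).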
ArticleTypeMap = {
"JournalArticle": "Article",
"Conference": "InProceedings",
"Dataset": "Misc",
"Book": "Book",
"BookSection": "InBook"
}
def main(options):
if options.bibtexInput is None or not os.path.exists(options.bibtexInput):
print("The search results will not be merged with existing bibtex because no bibtex input file was found.")
offsetLimit = 1000
papers = []
citationRequestTemplate = "https://api.semanticscholar.org/graph/v1/paper/%s/citations%s"
delay = 1
foundDOI = []
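    # Page through the /citations endpoint: each call returns at most
    # offsetLimit records, and the response carries a "next" key while more
    # pages remain. The ~1 s pause between calls is intended to stay within
    # the public (unauthenticated) rate limit.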
for originalDOI in options.citedWorks:
loop = True
offset = 0
while loop:
callStart = datetime.now()
params = {
'offset': str(offset),
'limit': str(offsetLimit),
'fields': 'title,url,authors,year,externalIds,citationCount,abstract,venue,journal,publicationTypes',
}
            fullRequest = citationRequestTemplate % originalDOI
            print("%s offset=%d" % (fullRequest, offset))
            rsp = requests.get(fullRequest, params=params)
callEnd = datetime.now()
            if rsp.status_code == 400:
                print(rsp.json()["error"])
            elif rsp.status_code == 404:
                print("The original paper with the identifier %s is not available at Semantic Scholar." % originalDOI)
                break
            rsp.raise_for_status()
            results = rsp.json()
            total = len(results["data"])
            if total == 0:
                if offset == 0:
                    print('No citing papers found for %s. Please try another query.' % originalDOI)
                break
loop = "next" in results
newPapers = [d['citingPaper'] for d in results['data']
if d['citingPaper']['externalIds'] is not None and
'DOI' in d['citingPaper']['externalIds'] and
d['citingPaper']['citationCount'] is not None and
d['citingPaper']['citationCount'] >= options.minCitations and
d['citingPaper']['externalIds']['DOI'] not in foundDOI]
papers.extend(newPapers)
foundDOI.extend([paper['externalIds']['DOI'] for paper in newPapers])
if loop:
offset += total
deltaTime = (callEnd - callStart).total_seconds()
if deltaTime < delay:
time.sleep(delay - deltaTime)
if len(papers) == 0:
print('No matches found. Please try another query.')
return
    if options.excludeMedia is not None:
        papers = [paper for paper in papers
                  if paper["venue"] is None or paper["venue"] not in options.excludeMedia]
        print("%d papers remain after removing the excluded publications" % len(papers))
if options.statistics:
from summa import keywords
keywordsTotal = {}
for paper in papers:
if paper["abstract"] is None:
continue
abstractKeywords = keywords.keywords(paper["abstract"], scores=True)[:5]
for abstractKeyword, score in abstractKeywords:
if abstractKeyword not in keywordsTotal:
keywordsTotal[abstractKeyword] = 0
keywordsTotal[abstractKeyword] += score
maxKeywords = 30
        sortedKeywords = sorted(keywordsTotal.items(), key=lambda x: x[1], reverse=True)[:maxKeywords]
        print("\n\nTop %d automatically discovered keywords:\n" % len(sortedKeywords))
        for i, (keyword, score) in enumerate(sortedKeywords, start=1):
            print("\t%-3d %-100s %.3f" % (i, keyword, score))
lowerToOriginalKeyWords = {keyword.lower(): keyword for keyword in options.keywords} if options.keywords else {}
blocks = []
now = datetime.now()
dateString = now.strftime("%Y%m%d")
baseGroup = "SemanticScholar_%s" % (dateString)
alreadyUsedKeywords = set()
groups = [(1, baseGroup, 0, 0)]
usedKeys = []
newEntries = []
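    # Build one BibTeX entry per paper. Keyword hits in the paper title become
    # JabRef subgroups (indent level 2) under the dated base group (level 1).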
for paper in papers:
        paperGroups = getRelevantKeyWords(paper, lowerToOriginalKeyWords) if options.keywords else []
        if len(paperGroups) > 0:
            extension = [(2, keyword, 0, 0) for keyword in paperGroups if keyword not in alreadyUsedKeywords]
            if len(extension) > 0:
                groups.extend(extension)
            alreadyUsedKeywords.update(paperGroups)
        paperGroups.append(baseGroup)
        newBlock = toBibTexBlock(paper, groups=paperGroups)
generatedKey = newBlock.key
i = 0
while newBlock.key in usedKeys:
newBlock.key = "%s_%d" % (generatedKey, i)
i += 1
newEntries.append(newBlock)
usedKeys.append(newBlock.key)
newEntries.sort(key=lambda x: (x.get("year").value, x.key))
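    # JabRef stores its group tree in a special comment block; the tree always
    # starts with the implicit root group "0 AllEntriesGroup:;".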
jabrefGroupHeader = "jabref-meta: grouping:\n0 AllEntriesGroup:;"
if options.bibtexOutput is not None and options.bibtexInput is not None:
print("Merge search results with old library %s" % options.bibtexInput)
oldLibrary = bibtexparser.parse_file(options.bibtexInput)
for comment in oldLibrary.comments:
if comment.comment.startswith("jabref-meta: grouping:"):
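                # Recover the groups already stored in the old library so they
                # survive the rewrite; their numeric fields are kept verbatim.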
knownGroupNames = [group[2] for group in groups]
foundGroups = re.findall(r"([0-9]+)\sStaticGroup:([^\\;]+)\\;([0-9]+)\\;([0-9]+)", comment.comment)
for foundGroup in foundGroups:
indent = int(foundGroup[0])
name = foundGroup[1]
nr1 = foundGroup[2]
nr2 = foundGroup[3]
if name not in knownGroupNames:
groups.append((indent, name, nr1, nr2))
break
blocks.extend(oldLibrary.entries)
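        # Drop new entries whose DOI is already present in the old library so
        # that existing annotations and comments are preserved.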
oldDOI = [block.get("DOI").value for block in blocks if block.get("DOI") is not None]
newEntries = [newEntry for newEntry in newEntries if newEntry.get("DOI").value not in oldDOI]
blocks.extend(newEntries)
jabRefComment = bibtexparser.model.ExplicitComment("jabref-meta: databaseType:bibtex;")
blocks.append(jabRefComment)
groupText = "%s\n%s" % (jabrefGroupHeader, "\n".join(["%d StaticGroup:%s\\;%s\\;%s\\;0x8a8a8aff\\;\\;\\;;" % (
indent, keyword, nr1, nr2) for indent, keyword, nr1, nr2 in groups]))
groupComment = bibtexparser.model.ExplicitComment(groupText)
blocks.append(groupComment)
newLibrary = bibtexparser.library.Library(blocks)
if options.bibtexOutput is not None:
bibText = bibtexparser.write_string(newLibrary)
with open(options.bibtexOutput, mode="w", encoding="UTF-8") as f:
print("Write to %s" % options.bibtexOutput)
f.write(bibText)
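# Strips apostrophes and non-breaking hyphens and keeps only the base character
# of each NFKD decomposition, so diacritics are removed for the citation keys.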
def formatName(name):
return u"".join([unicodedata.normalize("NFKD", x)[0] for x in name if x not in ("'", "‐")])
def getRelevantKeyWords(paper, keywords):
result = []
lowerTitle = paper["title"].lower()
result.extend([originalKeyword for lowerKeyword, originalKeyword in keywords.items() if lowerKeyword in lowerTitle])
return result
def getBibtexType(semanticScholarType):
    if not semanticScholarType or semanticScholarType[0] not in ArticleTypeMap:
return "Article"
return ArticleTypeMap[semanticScholarType[0]]
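# Citation keys follow <LastName>EtAl<year> for more than two authors and
# <Last1>And<Last2><year> otherwise; main() appends a numeric suffix on collision.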
def toBibTexBlock(paper, groups=()):
    authors = [a["name"].strip() for a in paper["authors"]]
    lastNames = [formatName(author).split(" ")[-1] for author in authors]
    citKey = "%s%d" % ("%sEtAl" % lastNames[0] if len(lastNames) > 2 else "And".join(lastNames),
                       paper["year"] if paper["year"] is not None else 0)
entry = bibtexparser.model.Entry(getBibtexType(paper["publicationTypes"]), citKey, [])
entry.set_field(bibtexparser.model.Field("author", " and ".join(authors)))
entry.set_field(bibtexparser.model.Field("year", paper["year"]))
entry.set_field(bibtexparser.model.Field("title", paper["title"]))
if entry.entry_type == "Article" and paper["venue"] is not None:
entry.set_field(bibtexparser.model.Field("journal", paper["venue"]))
if paper["journal"] is not None:
for field in ("pages", "volume"):
if field in paper["journal"]:
entry.set_field(bibtexparser.model.Field(field, paper["journal"][field]))
entry.set_field(bibtexparser.model.Field("DOI", paper['externalIds']['DOI']))
if len(groups) > 0:
entry.set_field(bibtexparser.model.Field("groups", ",".join(groups)))
return entry
def getOptions(args=None):
ap = argparse.ArgumentParser(
prog='LiteratureCrawler',
description='Search in literature databases and collect results in bibtex')
ap.add_argument("--bibtex-input", dest="bibtexInput", type=str)
ap.add_argument("--bibtex-output", dest="bibtexOutput", type=str)
ap.add_argument("--exclude-media", dest="excludeMedia", nargs="+",
type=str, help="Exclude results from the given media")
ap.add_argument("--cited-works", dest="citedWorks", nargs="+", type=str,
help="Give works by their identifiers for which the citing papers should be searched "
"(SemanticScholar hash confirmed, DOI 'DOI:' don't work although mentioned in the API doc)")
ap.add_argument("--min-citations", dest="minCitations", type=int, default=10,
help="Minimum citation count of a paper to be included in the output")
ap.add_argument("--keywords", nargs="+", type=str, help="Keywords to group the results in bibtex")
ap.add_argument("--statistics", default=False, action="store_true",
help="Whether to print statistics about the search result")
options = ap.parse_args(args=args)
return options
if __name__ == "__main__":
main(getOptions())