diff options
Diffstat (limited to 'bibtool/extract.py')
-rw-r--r-- | bibtool/extract.py | 37 |
1 files changed, 37 insertions, 0 deletions
# Copyright (C) 2018 Robin Krahl <robin.krahl@ireas.org>
# SPDX-License-Identifier: MIT

"""Extract BibTeX data for a document from the DOI stored in its metadata."""

import mimetypes
import urllib.parse
import urllib.request

import bibtexparser
import PyPDF2.pdf


# Key of the PDF document information entry that is read as the DOI.
PDF_INFO_DOI = '/doi'

# Seconds to wait for the DOI resolver before giving up, so that an
# unresponsive server cannot block the caller forever.
_REQUEST_TIMEOUT = 30


def _get_data_for_doi(doi):
    """Fetch the BibTeX record for *doi* and parse it.

    The DOI resolver is asked for a BibTeX-formatted bibliography entry
    through the ``Accept`` header.

    Returns whatever ``bibtexparser.loads`` returns for the response body.
    Errors raised by ``urllib`` (network failure, HTTP error status,
    timeout) propagate to the caller.
    """
    # DOIs may contain characters that are not valid in a URL path
    # (e.g. '<', '>', '#', '?', spaces); percent-encode everything but '/'.
    url = 'https://doi.org/{}'.format(urllib.parse.quote(doi))
    headers = {'Accept': 'text/bibliography; style=bibtex'}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=_REQUEST_TIMEOUT) as response:
        bibtex = response.read().decode('utf-8')
    return bibtexparser.loads(bibtex)


def _handle_pdf(filename):
    """Read the DOI from the PDF *filename* and return its BibTeX data.

    Raises ``Exception`` if the PDF has no document information, or if
    the document information has no (non-empty) ``PDF_INFO_DOI`` entry.
    """
    with open(filename, 'rb') as f:
        reader = PyPDF2.pdf.PdfFileReader(f)
        # documentInfo is None when the PDF has no info dictionary at
        # all; fall back to an empty mapping so the lookup cannot fail
        # with a TypeError.  Read the property only once.
        info = reader.documentInfo or {}
        doi = info.get(PDF_INFO_DOI)

    if not doi:
        raise Exception('PDF file does not have doi header')
    return _get_data_for_doi(doi)


def get_bibtex_data(filename):
    """Return the BibTeX data for the document stored in *filename*.

    The file type is guessed from the file name only.  PDF is the only
    supported type; for anything else an ``Exception`` naming the guessed
    MIME type is raised.
    """
    mtype, _ = mimetypes.guess_type(filename)
    if mtype == 'application/pdf':
        return _handle_pdf(filename)
    raise Exception('Unsupported file type ' + str(mtype))