# Copyright (C) 2018 Robin Krahl
# SPDX-License-Identifier: MIT

"""Look up BibTeX data for a PDF file via the DOI found inside it."""

import mimetypes
import re
import urllib.request

import bibtexparser
import pdftotext
import PyPDF2.pdf

# Key of the document-information entry that is checked for a DOI.
PDF_INFO_DOI = '/doi'

# Matches a DOI-shaped token: "10.", a registrant code of at least four
# digits (optionally with dotted sub-codes), a slash, and a suffix of
# non-whitespace characters excluding quotes, '&', '<' and '>'.
# Compiled once at import time instead of on every call.
_DOI_PATTERN = re.compile(
    r'\b(10\.\d{4,}(?:\.\d+)*/(?:(?!["&\'<>])\S)+)\b'
)

# DOI resolver queried with content negotiation; HTTPS replaces the legacy
# plaintext "http://dx.doi.org/" endpoint.
_DOI_URL = 'https://doi.org/{}'


def _extract_doi(filename):
    """Return the DOI found in the PDF at *filename*, or ``None``.

    The document-information dictionary is checked first for a
    ``PDF_INFO_DOI`` entry.  If that yields nothing, the text of the first
    page is searched for a DOI-shaped token.

    :param filename: path of the PDF file to inspect
    :return: the DOI, or ``None`` if neither source contains one
    """
    with open(filename, 'rb') as f:
        # The reader parses lazily, so the file must stay open while the
        # document information is accessed.
        info = PyPDF2.pdf.PdfFileReader(f).documentInfo
        # documentInfo is None for PDFs without an Info dictionary; a bare
        # membership test on it would raise TypeError.
        if info is not None and PDF_INFO_DOI in info:
            doi = info[PDF_INFO_DOI]
            # An empty entry is useless; fall through to the text search.
            if doi:
                return doi

    with open(filename, 'rb') as f:
        pdf = pdftotext.PDF(f)
        if len(pdf) > 0:
            # Only the first page is searched.
            match = _DOI_PATTERN.search(pdf[0])
            if match:
                return match.group(1)

    return None


def _get_data_for_doi(doi):
    """Fetch the BibTeX record for *doi* and parse it.

    The DOI resolver is asked for a BibTeX-formatted bibliography entry
    through the ``Accept`` request header.

    :param doi: the DOI to look up
    :return: the result of ``bibtexparser.loads`` on the response body
    :raises urllib.error.URLError: if the request fails
    """
    url = _DOI_URL.format(doi)
    headers = {'Accept': 'text/bibliography; style=bibtex'}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request) as response:
        bibtex = response.read().decode('utf-8')
    return bibtexparser.loads(bibtex)


def _handle_pdf(filename):
    """Return the parsed BibTeX data for the PDF at *filename*.

    :param filename: path of the PDF file
    :raises Exception: if no DOI can be extracted from the file
    """
    doi = _extract_doi(filename)
    if not doi:
        raise Exception('Could not extract a DOI from the PDF file')
    return _get_data_for_doi(doi)


def get_bibtex_data(filename):
    """Return the parsed BibTeX data for the document at *filename*.

    The file type is guessed from the file name; only PDF files are
    supported.

    :param filename: path of the document
    :raises Exception: if the file type is not supported, or if no DOI can
        be extracted from a PDF file
    """
    mtype, _ = mimetypes.guess_type(filename)
    if mtype == 'application/pdf':
        return _handle_pdf(filename)
    raise Exception('Unsupported file type ' + str(mtype))