# Copyright (C) 2018 Robin Krahl
# SPDX-License-Identifier: MIT

"""Extract DOIs from documents and fetch the matching BibTeX records."""

import mimetypes
import re
import urllib.request

import bibtexparser
import pdftotext
import PyPDF2.pdf

# Key of the document information dictionary that may hold the DOI.
PDF_INFO_DOI = '/doi'

# Matches a DOI ("10.<registrant>/<suffix>") inside free text.  The suffix
# stops at whitespace and at the characters " & ' < >.  Compiled once at
# import time instead of on every call.
_DOI_PATTERN = re.compile(r'\b(10\.\d{4,}(?:\.\d+)*/(?:(?!["&\'<>])\S)+)\b')


def _unique(iterable):
    """Yield the items of *iterable* in order, skipping repeated values.

    Items must be hashable because the values already seen are kept in a
    set.
    """
    elems = set()
    for value in iterable:
        if value not in elems:
            elems.add(value)
            yield value


def _get_doi_generator_pdf(filename):
    """Yield DOI candidates found in the PDF file *filename*.

    Two sources are consulted, in this order:

    1. the ``/doi`` entry of the document information dictionary;
    2. the first DOI-like string in the text of the first page.

    The same DOI may be yielded twice; ``get_doi_generator`` removes
    duplicates.
    """
    metadata_dois = []
    with open(filename, 'rb') as f:
        reader = PyPDF2.pdf.PdfFileReader(f)
        info = reader.documentInfo
        # documentInfo is None when the PDF has no information dictionary;
        # a bare ``in`` test on None would raise TypeError and prevent the
        # text-based fallback below from ever running.
        if info is not None and PDF_INFO_DOI in info:
            metadata_dois.append(info[PDF_INFO_DOI])
    # Yield only after the file has been closed so that a consumer that
    # stops iterating early does not keep the handle open.
    yield from metadata_dois

    first_page = None
    with open(filename, 'rb') as f:
        pdf = pdftotext.PDF(f)
        if len(pdf):
            first_page = pdf[0]
    if first_page is not None:
        m = _DOI_PATTERN.search(first_page)
        if m:
            yield m.group(1)


def get_doi_generator(filename):
    """Return a generator over the unique DOI candidates for *filename*.

    The file type is guessed from the file name via ``mimetypes``; only
    ``application/pdf`` is supported.

    Raises:
        Exception: if the guessed MIME type is not supported.  The check
            happens when this function is called, not when the returned
            generator is first iterated.
    """
    mtype, _ = mimetypes.guess_type(filename)
    if mtype == 'application/pdf':
        return _unique(_get_doi_generator_pdf(filename))
    raise Exception('Unsupported file type ' + str(mtype))


def get_bibtex_data(doi):
    """Fetch the BibTeX record for *doi* and return it parsed.

    The record is requested from the DOI resolver with an ``Accept``
    header asking for a BibTeX-formatted bibliography, decoded as UTF-8
    and parsed with ``bibtexparser.loads``.

    Returns:
        The object returned by ``bibtexparser.loads``; it contains exactly
        one entry.

    Raises:
        Exception: if the response contains no BibTeX entry or more than
            one.  Errors raised by ``urllib.request.urlopen`` are not
            caught here.
    """
    url = 'http://dx.doi.org/{}'.format(doi)
    headers = {'Accept': 'text/bibliography; style=bibtex'}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request) as response:
        bibtex = response.read().decode('utf-8')

    bibtex_data = bibtexparser.loads(bibtex)
    if not bibtex_data.entries:
        raise Exception('Did not find any Bibtex entry.')
    if len(bibtex_data.entries) > 1:
        raise Exception('Found more than one Bibtex entries.')
    return bibtex_data