From 6384d1815135649af4fa647f3205c16ed7dce95a Mon Sep 17 00:00:00 2001
From: Robin Krahl
Date: Sat, 24 Mar 2018 18:00:27 +0100
Subject: Add extract command to extract the DOI

---
 bibtool/extract.py | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

(limited to 'bibtool/extract.py')

diff --git a/bibtool/extract.py b/bibtool/extract.py
index cefbe8f..5793da6 100644
--- a/bibtool/extract.py
+++ b/bibtool/extract.py
@@ -13,11 +13,19 @@ import PyPDF2.pdf
 PDF_INFO_DOI = '/doi'
 
 
-def _extract_doi(filename):
+def _unique(iterable):
+    elems = set()
+    for value in iterable:
+        if value not in elems:
+            elems.add(value)
+            yield value
+
+
+def _get_doi_generator_pdf(filename):
     with open(filename, 'rb') as f:
         reader = PyPDF2.pdf.PdfFileReader(f)
         if PDF_INFO_DOI in reader.documentInfo:
-            return reader.documentInfo[PDF_INFO_DOI]
+            yield reader.documentInfo[PDF_INFO_DOI]
 
     with open(filename, 'rb') as f:
         pdf = pdftotext.PDF(f)
@@ -28,32 +36,25 @@ def _extract_doi(filename):
     pattern = re.compile(r'\b(10\.\d{4,}(?:\.\d+)*/(?:(?!["&\'<>])\S)+)\b')
     m = pattern.search(text)
     if m:
-        return m.group(1)
+        yield m.group(1)
+
+
+def get_doi_generator(filename):
+    mtype, _ = mimetypes.guess_type(filename)
+    generator = None
+    if mtype == 'application/pdf':
+        generator = _get_doi_generator_pdf(filename)
 
-    return None
+    if not generator:
+        raise Exception('Unsupported file type ' + str(mtype))
 
+    return _unique(generator)
 
-def _get_data_for_doi(doi):
+
+def get_bibtex_data(doi):
     url = 'http://dx.doi.org/{}'.format(doi)
     headers = {'Accept': 'text/bibliography; style=bibtex'}
     request = urllib.request.Request(url, headers=headers)
     with urllib.request.urlopen(request) as response:
         bibtex = response.read().decode('utf-8')
     return bibtexparser.loads(bibtex)
-
-
-def _handle_pdf(filename, doi):
-    if not doi:
-        doi = _extract_doi(filename)
-
-    if not doi:
-        raise Exception('Could not extract a DOI from the PDF file')
-
-    return _get_data_for_doi(doi)
-
-
-def get_bibtex_data(filename, doi):
-    mtype, _ = mimetypes.guess_type(filename)
-    if mtype == 'application/pdf':
-        return _handle_pdf(filename, doi)
-    raise Exception('Unsupported file type ' + str(mtype))
-- 
cgit v1.2.1