diff options
author | Robin Krahl <me@robin-krahl.de> | 2018-03-09 19:26:54 +0100 |
---|---|---|
committer | Robin Krahl <me@robin-krahl.de> | 2018-03-09 19:26:54 +0100 |
commit | 64037f9a53928ef3fb8b468d7abc2d15ecd1a28c (patch) | |
tree | 83723ae7abf80d2f2eeb1e94913cae407f0bc41d | |
parent | 8d014787c2cce2b6f4afdaa344bb1763b239ffa5 (diff) | |
download | bibtool-64037f9a53928ef3fb8b468d7abc2d15ecd1a28c.tar.gz bibtool-64037f9a53928ef3fb8b468d7abc2d15ecd1a28c.tar.bz2 |
Extract DOI from PDF text using a regex
-rw-r--r-- | bibtool/extract.py | 31 | ||||
-rw-r--r-- | setup.py | 2 |
2 files changed, 27 insertions, 6 deletions
```diff
diff --git a/bibtool/extract.py b/bibtool/extract.py
index 6409f1b..6955dbe 100644
--- a/bibtool/extract.py
+++ b/bibtool/extract.py
@@ -2,15 +2,37 @@
 # SPDX-License-Identifier: MIT
 
 import mimetypes
+import re
 import urllib.request
 
 import bibtexparser
+import pdftotext
 import PyPDF2.pdf
 
 
 PDF_INFO_DOI = '/doi'
 
 
+def _extract_doi(filename):
+    with open(filename, 'rb') as f:
+        reader = PyPDF2.pdf.PdfFileReader(f)
+        if PDF_INFO_DOI in reader.documentInfo:
+            return reader.documentInfo[PDF_INFO_DOI]
+
+    with open(filename, 'rb') as f:
+        pdf = pdftotext.PDF(f)
+
+    if len(pdf):
+        text = pdf[0]
+
+        pattern = re.compile(r'\b(10\.\d{4,}(?:\.\d+)*/(?:(?!["&\'<>])\S)+)\b')
+        m = pattern.search(text)
+        if m:
+            return m.group(1)
+
+    return None
+
+
 def _get_data_for_doi(doi):
     url = 'http://dx.doi.org/{}'.format(doi)
     headers = {'Accept': 'text/bibliography; style=bibtex'}
@@ -21,11 +43,10 @@ def _get_data_for_doi(doi):
 
 
 def _handle_pdf(filename):
-    with open(filename, 'rb') as f:
-        reader = PyPDF2.pdf.PdfFileReader(f)
-        if PDF_INFO_DOI not in reader.documentInfo:
-            raise Exception('PDF file does not have doi header')
-        doi = reader.documentInfo[PDF_INFO_DOI]
+    doi = _extract_doi(filename)
+
+    if not doi:
+        raise Exception('Could not extract a DOI from the PDF file')
 
     return _get_data_for_doi(doi)
 
diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@ setuptools.setup(
         'Programming Language :: Python :: 3',
     ],
     packages=['bibtool'],
-    install_requires=['bibtexparser', 'click', 'PyPDF2'],
+    install_requires=['bibtexparser', 'click', 'pdftotext', 'PyPDF2'],
     extras_require={
         'checkstyle': ['flake8'],
     },
```