summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- bibtool/extract.py | 31
-rw-r--r-- setup.py | 2
2 files changed, 27 insertions, 6 deletions
diff --git a/bibtool/extract.py b/bibtool/extract.py
index 6409f1b..6955dbe 100644
--- a/bibtool/extract.py
+++ b/bibtool/extract.py
@@ -2,15 +2,37 @@
# SPDX-License-Identifier: MIT
import mimetypes
+import re
import urllib.request
import bibtexparser
+import pdftotext
import PyPDF2.pdf
PDF_INFO_DOI = '/doi'
+def _extract_doi(filename):
+ with open(filename, 'rb') as f:
+ reader = PyPDF2.pdf.PdfFileReader(f)
+ if PDF_INFO_DOI in reader.documentInfo:
+ return reader.documentInfo[PDF_INFO_DOI]
+
+ with open(filename, 'rb') as f:
+ pdf = pdftotext.PDF(f)
+
+ if len(pdf):
+ text = pdf[0]
+
+ pattern = re.compile(r'\b(10\.\d{4,}(?:\.\d+)*/(?:(?!["&\'<>])\S)+)\b')
+ m = pattern.search(text)
+ if m:
+ return m.group(1)
+
+ return None
+
+
def _get_data_for_doi(doi):
url = 'http://dx.doi.org/{}'.format(doi)
headers = {'Accept': 'text/bibliography; style=bibtex'}
@@ -21,11 +43,10 @@ def _get_data_for_doi(doi):
def _handle_pdf(filename):
- with open(filename, 'rb') as f:
- reader = PyPDF2.pdf.PdfFileReader(f)
- if PDF_INFO_DOI not in reader.documentInfo:
- raise Exception('PDF file does not have doi header')
- doi = reader.documentInfo[PDF_INFO_DOI]
+ doi = _extract_doi(filename)
+
+ if not doi:
+ raise Exception('Could not extract a DOI from the PDF file')
return _get_data_for_doi(doi)
diff --git a/setup.py b/setup.py
index d871b5c..9333697 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@ setuptools.setup(
'Programming Language :: Python :: 3',
],
packages=['bibtool'],
- install_requires=['bibtexparser', 'click', 'PyPDF2'],
+ install_requires=['bibtexparser', 'click', 'pdftotext', 'PyPDF2'],
extras_require={
'checkstyle': ['flake8'],
},