summaryrefslogtreecommitdiff
path: root/bibtool/extract.py
diff options
context:
space:
mode:
Diffstat (limited to 'bibtool/extract.py')
-rw-r--r--bibtool/extract.py45
1 files changed, 23 insertions, 22 deletions
diff --git a/bibtool/extract.py b/bibtool/extract.py
index cefbe8f..5793da6 100644
--- a/bibtool/extract.py
+++ b/bibtool/extract.py
@@ -13,11 +13,19 @@ import PyPDF2.pdf
PDF_INFO_DOI = '/doi'
-def _extract_doi(filename):
+def _unique(iterable):
+ elems = set()
+ for value in iterable:
+ if value not in elems:
+ elems.add(value)
+ yield value
+
+
+def _get_doi_generator_pdf(filename):
with open(filename, 'rb') as f:
reader = PyPDF2.pdf.PdfFileReader(f)
if PDF_INFO_DOI in reader.documentInfo:
- return reader.documentInfo[PDF_INFO_DOI]
+ yield reader.documentInfo[PDF_INFO_DOI]
with open(filename, 'rb') as f:
pdf = pdftotext.PDF(f)
@@ -28,32 +36,25 @@ def _extract_doi(filename):
pattern = re.compile(r'\b(10\.\d{4,}(?:\.\d+)*/(?:(?!["&\'<>])\S)+)\b')
m = pattern.search(text)
if m:
- return m.group(1)
+ yield m.group(1)
+
+
+def get_doi_generator(filename):
+ mtype, _ = mimetypes.guess_type(filename)
+ generator = None
+ if mtype == 'application/pdf':
+ generator = _get_doi_generator_pdf(filename)
- return None
+ if not generator:
+ raise Exception('Unsupported file type ' + str(mtype))
+ return _unique(generator)
-def _get_data_for_doi(doi):
+
+def get_bibtex_data(doi):
url = 'http://dx.doi.org/{}'.format(doi)
headers = {'Accept': 'text/bibliography; style=bibtex'}
request = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(request) as response:
bibtex = response.read().decode('utf-8')
return bibtexparser.loads(bibtex)
-
-
-def _handle_pdf(filename, doi):
- if not doi:
- doi = _extract_doi(filename)
-
- if not doi:
- raise Exception('Could not extract a DOI from the PDF file')
-
- return _get_data_for_doi(doi)
-
-
-def get_bibtex_data(filename, doi):
- mtype, _ = mimetypes.guess_type(filename)
- if mtype == 'application/pdf':
- return _handle_pdf(filename, doi)
- raise Exception('Unsupported file type ' + str(mtype))