summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Robin Krahl <me@robin-krahl.de> 2018-03-24 18:00:27 +0100
committer Robin Krahl <me@robin-krahl.de> 2018-03-24 18:00:27 +0100
commit 6384d1815135649af4fa647f3205c16ed7dce95a (patch)
tree 62276edc4b661c0357facccbe0c543e62ae3c621
parent 54efba8e3b8fb20e506a31b3b9e452e1eccffe34 (diff)
download bibtool-6384d1815135649af4fa647f3205c16ed7dce95a.tar.gz
bibtool-6384d1815135649af4fa647f3205c16ed7dce95a.tar.bz2
Add extract command to extract the DOI
-rw-r--r-- bibtool/cli.py 18
-rw-r--r-- bibtool/extract.py 45
2 files changed, 40 insertions, 23 deletions
diff --git a/bibtool/cli.py b/bibtool/cli.py
index 6fe0890..63e3d5b 100644
--- a/bibtool/cli.py
+++ b/bibtool/cli.py
@@ -19,13 +19,29 @@ def cli():
pass
+@cli.command('extract')
+@click.argument('filename', type=TYPE_FILE)
+@click.option('--all/--single', '-a/-s', 'print_all', default=False)
+def _extract(filename, print_all):
+ for doi in bibtool.extract.get_doi_generator(filename):
+ print(doi)
+ if not print_all:
+ return
+
+
@cli.command('import')
@click.argument('filename', type=TYPE_FILE)
@click.option('--directory', type=TYPE_DIR, default=os.getcwd())
@click.option('--delete/--no-delete', default=False)
@click.option('--doi', type=str, default=None)
def _import(filename, directory, delete, doi):
- bibtex_data = bibtool.extract.get_bibtex_data(filename, doi)
+ doi_generator = bibtool.extract.get_doi_generator(filename)
+ try:
+ doi = next(doi_generator)
+ except StopIteration:
+ raise Exception('Could not extract a DOI from {}'.format(filename))
+
+ bibtex_data = bibtool.extract.get_bibtex_data(doi)
if not bibtex_data.entries:
raise Exception('Did not find any Bibtex entry.')
diff --git a/bibtool/extract.py b/bibtool/extract.py
index cefbe8f..5793da6 100644
--- a/bibtool/extract.py
+++ b/bibtool/extract.py
@@ -13,11 +13,19 @@ import PyPDF2.pdf
PDF_INFO_DOI = '/doi'
-def _extract_doi(filename):
+def _unique(iterable):
+ elems = set()
+ for value in iterable:
+ if value not in elems:
+ elems.add(value)
+ yield value
+
+
+def _get_doi_generator_pdf(filename):
with open(filename, 'rb') as f:
reader = PyPDF2.pdf.PdfFileReader(f)
if PDF_INFO_DOI in reader.documentInfo:
- return reader.documentInfo[PDF_INFO_DOI]
+ yield reader.documentInfo[PDF_INFO_DOI]
with open(filename, 'rb') as f:
pdf = pdftotext.PDF(f)
@@ -28,32 +36,25 @@ def _extract_doi(filename):
pattern = re.compile(r'\b(10\.\d{4,}(?:\.\d+)*/(?:(?!["&\'<>])\S)+)\b')
m = pattern.search(text)
if m:
- return m.group(1)
+ yield m.group(1)
+
+
+def get_doi_generator(filename):
+ mtype, _ = mimetypes.guess_type(filename)
+ generator = None
+ if mtype == 'application/pdf':
+ generator = _get_doi_generator_pdf(filename)
- return None
+ if not generator:
+ raise Exception('Unsupported file type ' + str(mtype))
+ return _unique(generator)
-def _get_data_for_doi(doi):
+
+def get_bibtex_data(doi):
url = 'http://dx.doi.org/{}'.format(doi)
headers = {'Accept': 'text/bibliography; style=bibtex'}
request = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(request) as response:
bibtex = response.read().decode('utf-8')
return bibtexparser.loads(bibtex)
-
-
-def _handle_pdf(filename, doi):
- if not doi:
- doi = _extract_doi(filename)
-
- if not doi:
- raise Exception('Could not extract a DOI from the PDF file')
-
- return _get_data_for_doi(doi)
-
-
-def get_bibtex_data(filename, doi):
- mtype, _ = mimetypes.guess_type(filename)
- if mtype == 'application/pdf':
- return _handle_pdf(filename, doi)
- raise Exception('Unsupported file type ' + str(mtype))