From 6384d1815135649af4fa647f3205c16ed7dce95a Mon Sep 17 00:00:00 2001
From: Robin Krahl
Date: Sat, 24 Mar 2018 18:00:27 +0100
Subject: Add extract command to extract the DOI

---
 bibtool/cli.py     | 18 +++++++++++++++++-
 bibtool/extract.py | 45 +++++++++++++++++++++++----------------------
 2 files changed, 40 insertions(+), 23 deletions(-)

(limited to 'bibtool')

diff --git a/bibtool/cli.py b/bibtool/cli.py
index 6fe0890..63e3d5b 100644
--- a/bibtool/cli.py
+++ b/bibtool/cli.py
@@ -19,13 +19,29 @@ def cli():
     pass
 
 
+@cli.command('extract')
+@click.argument('filename', type=TYPE_FILE)
+@click.option('--all/--single', '-a/-s', 'print_all', default=False)
+def _extract(filename, print_all):
+    for doi in bibtool.extract.get_doi_generator(filename):
+        print(doi)
+        if not print_all:
+            return
+
+
 @cli.command('import')
 @click.argument('filename', type=TYPE_FILE)
 @click.option('--directory', type=TYPE_DIR, default=os.getcwd())
 @click.option('--delete/--no-delete', default=False)
 @click.option('--doi', type=str, default=None)
 def _import(filename, directory, delete, doi):
-    bibtex_data = bibtool.extract.get_bibtex_data(filename, doi)
+    doi_generator = bibtool.extract.get_doi_generator(filename)
+    try:
+        doi = next(doi_generator)
+    except StopIteration:
+        raise Exception('Could not extract a DOI from {}'.format(filename))
+
+    bibtex_data = bibtool.extract.get_bibtex_data(doi)
     if not bibtex_data.entries:
         raise Exception('Did not find any Bibtex entry.')
 
diff --git a/bibtool/extract.py b/bibtool/extract.py
index cefbe8f..5793da6 100644
--- a/bibtool/extract.py
+++ b/bibtool/extract.py
@@ -13,11 +13,19 @@ import PyPDF2.pdf
 PDF_INFO_DOI = '/doi'
 
 
-def _extract_doi(filename):
+def _unique(iterable):
+    elems = set()
+    for value in iterable:
+        if value not in elems:
+            elems.add(value)
+            yield value
+
+
+def _get_doi_generator_pdf(filename):
     with open(filename, 'rb') as f:
         reader = PyPDF2.pdf.PdfFileReader(f)
         if PDF_INFO_DOI in reader.documentInfo:
-            return reader.documentInfo[PDF_INFO_DOI]
+            yield reader.documentInfo[PDF_INFO_DOI]
 
     with open(filename, 'rb') as f:
         pdf = pdftotext.PDF(f)
@@ -28,32 +36,25 @@ def _extract_doi(filename):
     pattern = re.compile(r'\b(10\.\d{4,}(?:\.\d+)*/(?:(?!["&\'<>])\S)+)\b')
     m = pattern.search(text)
     if m:
-        return m.group(1)
+        yield m.group(1)
+
+
+def get_doi_generator(filename):
+    mtype, _ = mimetypes.guess_type(filename)
+    generator = None
+    if mtype == 'application/pdf':
+        generator = _get_doi_generator_pdf(filename)
 
-    return None
+    if not generator:
+        raise Exception('Unsupported file type ' + str(mtype))
+
+    return _unique(generator)
 
 
-def _get_data_for_doi(doi):
+def get_bibtex_data(doi):
     url = 'http://dx.doi.org/{}'.format(doi)
     headers = {'Accept': 'text/bibliography; style=bibtex'}
     request = urllib.request.Request(url, headers=headers)
     with urllib.request.urlopen(request) as response:
         bibtex = response.read().decode('utf-8')
     return bibtexparser.loads(bibtex)
-
-
-def _handle_pdf(filename, doi):
-    if not doi:
-        doi = _extract_doi(filename)
-
-    if not doi:
-        raise Exception('Could not extract a DOI from the PDF file')
-
-    return _get_data_for_doi(doi)
-
-
-def get_bibtex_data(filename, doi):
-    mtype, _ = mimetypes.guess_type(filename)
-    if mtype == 'application/pdf':
-        return _handle_pdf(filename, doi)
-    raise Exception('Unsupported file type ' + str(mtype))
-- 
cgit v1.2.1