summary refs log tree commit diff
path: root/bibtool/extract.py
diff options
context:
space:
mode:
author Robin Krahl <robin.krahl@ireas.org> 2018-03-08 21:07:16 +0100
committer Robin Krahl <robin.krahl@ireas.org> 2018-03-08 21:07:16 +0100
commit 4a0cdafd28b4288b8f9fae9db84b0ebd8dfba8a8 (patch)
tree c26494770276bd907c32d49e4aadbbf73d256377 /bibtool/extract.py
parent 134d48399ac537d4841575ee0363a309a3461e23 (diff)
download bibtool-4a0cdafd28b4288b8f9fae9db84b0ebd8dfba8a8.tar.gz
bibtool-4a0cdafd28b4288b8f9fae9db84b0ebd8dfba8a8.tar.bz2
Add basic import implementation
Diffstat (limited to 'bibtool/extract.py')
-rw-r--r-- bibtool/extract.py 37
1 files changed, 37 insertions, 0 deletions
diff --git a/bibtool/extract.py b/bibtool/extract.py
new file mode 100644
index 0000000..6409f1b
--- /dev/null
+++ b/bibtool/extract.py
@@ -0,0 +1,37 @@
+# Copyright (C) 2018 Robin Krahl <robin.krahl@ireas.org>
+# SPDX-License-Identifier: MIT
+
+import mimetypes
+import urllib.request
+
+import bibtexparser
+import PyPDF2.pdf
+
+
+PDF_INFO_DOI = '/doi'
+
+
+def _get_data_for_doi(doi):
+    """Fetch the BibTeX record for a DOI and parse it.
+
+    The DOI is resolved through the doi.org resolver using HTTP content
+    negotiation: the request asks for ``text/bibliography; style=bibtex``
+    and the response body is decoded as UTF-8.
+
+    :param doi: the DOI to look up (without any resolver prefix)
+    :returns: the result of ``bibtexparser.loads`` for the response text
+    :raises urllib.error.URLError: if the resolver cannot be reached or
+        answers with an HTTP error status
+    """
+    # Imported locally so this block does not depend on urllib.request
+    # having pulled in urllib.parse as a side effect.
+    import urllib.parse
+
+    # The DOI usually comes from PDF metadata, so it may carry surrounding
+    # whitespace or characters that are not legal in a URL path (spaces,
+    # '#', '?', '<', '>').  Percent-encode it, keeping the '/' that
+    # separates the DOI prefix from its suffix.
+    quoted_doi = urllib.parse.quote(str(doi).strip(), safe='/')
+    url = 'https://doi.org/{}'.format(quoted_doi)
+    headers = {'Accept': 'text/bibliography; style=bibtex'}
+    request = urllib.request.Request(url, headers=headers)
+    with urllib.request.urlopen(request) as response:
+        bibtex = response.read().decode('utf-8')
+    return bibtexparser.loads(bibtex)
+
+
+def _handle_pdf(filename):
+    """Look up bibliographic data for a PDF via the DOI in its metadata.
+
+    Reads the ``/doi`` entry (``PDF_INFO_DOI``) from the PDF's document
+    information and passes it to ``_get_data_for_doi``.
+
+    :param filename: path of the PDF file to inspect
+    :returns: whatever ``_get_data_for_doi`` returns for the DOI
+    :raises Exception: if the PDF has no document information or the
+        document information has no ``/doi`` entry
+    """
+    with open(filename, 'rb') as f:
+        reader = PyPDF2.pdf.PdfFileReader(f)
+        # documentInfo is None for a PDF without an /Info dictionary; a
+        # bare membership test on None would raise TypeError instead of
+        # the intended error.  Read the property once and reuse it.
+        info = reader.documentInfo
+        if info is None or PDF_INFO_DOI not in info:
+            raise Exception('PDF file does not have doi header')
+        doi = info[PDF_INFO_DOI]
+
+    # The file is closed before the network lookup starts.
+    return _get_data_for_doi(doi)
+
+
+def get_bibtex_data(filename):
+    """Return bibliographic data for the given file.
+
+    The MIME type is guessed from the file name.  PDF files are handed to
+    ``_handle_pdf``; any other guess (including no guess at all) is
+    rejected.
+
+    :param filename: path of the file to extract data for
+    :returns: the result of ``_handle_pdf`` for PDF files
+    :raises Exception: if the guessed type is not ``application/pdf``
+    """
+    guessed_type = mimetypes.guess_type(filename)[0]
+    if guessed_type != 'application/pdf':
+        raise Exception('Unsupported file type ' + str(guessed_type))
+    return _handle_pdf(filename)