# exec(open("Documents/mathias/zenod/test dataset to zenodo.py").read())
import requests
import pandas as pd
import json
import os
import datetime
import re
import sys
def upload_to_sandbox(filename, ucount, insts, access_token, logfile, quicklog, i):
    """Create one Zenodo deposition for a herbarium specimen and upload its files.

    Uploads the JPEG scan, an optional TIFF, the JSON-LD label data file and
    optional segmentation PNGs, then attaches Zenodo metadata derived from the
    JSON-LD file.

    Parameters:
        filename     -- image file name; expected to be '<barcode>.jpg'
        ucount       -- index of the final loop iteration; used to avoid a
                        superfluous trailing comma in the JSON log
        insts        -- DataFrame linking institution names to herbarium setIDs,
                        for the metadata
        access_token -- Zenodo API access token
        logfile      -- open file; raw JSON API responses are dumped here
        quicklog     -- open tsv file; one line of status codes per specimen,
                        with 'NONE' where an optional file could not be found
        i            -- index of the current loop iteration
    """
    quicklog.write('%s\t' % filename.replace('.jpg', ''))
    # Generic headers for the API requests
    headers = {"Content-Type": "application/json"}
    headers2 = {"Accept": "application/json",
                "Authorization": "Bearer %s" % access_token,
                "Content-Type": "application/octet-stream"}

    def _log_response(resp):
        # Dump the raw JSON response, a separating comma and the status code.
        logfile.write(json.dumps(resp.json(), indent=4, sort_keys=True))
        logfile.write(",")
        quicklog.write('%s\t' % resp.status_code)

    def _put_file(path, name):
        # Stream a local file into the deposition bucket and log the response.
        # 'with' closes the handle, which the original code leaked.
        with open(path, 'rb') as fh:
            resp = requests.put('%s/%s' % (bucket_url, name),
                                data=fh, headers=headers2)
        _log_response(resp)

    # Description fragments, filled only when TIFF / segmentation files exist.
    # Any formatting needs to be done in HTML.
    segst = ''
    tifst = ''
    ##
    # 1) Create an empty deposition.
    # NOTE(review): the URL targets production zenodo.org although the function
    # is named upload_to_sandbox -- confirm which instance is intended.
    r = requests.post('https://zenodo.org/api/deposit/depositions',
                      params={'access_token': access_token}, json={},
                      headers=headers)
    _log_response(r)
    # Deposition id and bucket url are needed for all follow-up requests
    bucket_url = r.json()['links']['bucket']
    depo_id = r.json()['id']
    ##
    # 2a) Upload the JPEG image
    _put_file('/all 1800 jpg/%s' % filename, filename)
    ##
    # 2b) Upload the TIFF image, if there is one
    filename = filename.replace('.jpg', '.tif')
    fileloc = '/tif_storage/all/%s' % filename
    if os.path.exists(fileloc):
        _put_file(fileloc, filename)
        tifst = ('\n- A lossless TIFF image from which the JPEG image '
                 'has been derived.')
    else:
        quicklog.write('NONE\t')
    ##
    # 2c) Upload the JSON-LD data file
    fileloc = '/alljsons/%s.json' % filename.replace('.tif', '')
    filename = filename.replace('.tif', '.json')
    _put_file(fileloc, filename)
    ##
    # 2d) Upload the segmentation PNG files, if there are any
    filename = filename.replace('.json', '_all.png')
    seg = '/seg/%s' % filename
    if os.path.exists(seg):
        _put_file(seg, filename)
        filename = filename.replace('_all', '_sel')
        _put_file('/seg/%s' % filename, filename)
        segst = ('\n- Two PNG files containing segmented image overlays of the '
                 'scanned herbarium sheet. The _all extension indicates that all '
                 'labels, color charts and pieces of text have received a '
                 'different color against a black background color. The _sel '
                 "extension indicates that these elements are white if they're "
                 "barcode labels, yellow if they're color charts and red if "
                 "they're anything else.")
    else:
        quicklog.write('NONE\tNONE\t')
    ##
    # 3) Set up Zenodo metadata
    # Read the data file (windows encoding)
    with open(fileloc, 'r', encoding="iso-8859-15") as fh:
        fileread = json.load(fh)
    record = fileread['@graph'][0]
    # Replace the namespace prefixes with full URLs so the data fields can be
    # added as Zenodo subjects
    subjects = {k.replace('dcterms:', 'http://purl.org/dc/terms/')
                 .replace('dwc:', 'http://rs.tdwg.org/dwc/terms/')
                 .replace('dc:', 'http://purl.org/dc/elements/1.1/'): v
                for k, v in record.items()}
    # Drop custom (non-namespaced) terms and convert to the structure required
    # by Zenodo, stringifying integer values on the way
    subjects = [{'term': str(v), 'identifier': k}
                for k, v in subjects.items() if re.match('http://', k)]
    # Institution name for this specimen's setID (used for title and creator)
    institution = insts['institution'][list(insts['setID']).index(record['setID'])]
    # Formulate the data fields
    mdata = {
        'metadata': {
            'title': 'Herbarium specimen image of %s, part of the collection of %s'
                     % (record['dwc:scientificName'], institution),
            'upload_type': 'image',
            'image_type': 'photo',
            'description': ('Part of a training dataset of scanned herbarium '
                            'specimens. The data paper and a summary landing page '
                            'will be published on Zenodo as it gets published.'
                            '\nContent of this deposition:'
                            '\n- A JSON-LD datafile listing the label data '
                            'associated with this herbarium specimen. The Darwin '
                            'and Dublin Core data standards are used for most '
                            'values.'
                            '\n- A JPEG image file of the scanned herbarium '
                            'sheet.%s%s' % (tifst, segst)),
            'creators': [{'name': '%s' % institution}],
            'grants': [{'id': '777483'}],
            'language': record['dcterms:language'],
            'related_identifiers': [{'identifier': record['@id'],
                                     'relation': 'isAlternateIdentifier'}],
            'keywords': ['biodiversity', 'herbarium sheet'],
            'subjects': subjects,
            'communities': [{'identifier': 'icedigtest'}]
        }
    }
    # Upload the Zenodo metadata
    r = requests.put('https://zenodo.org/api/deposit/depositions/%s' % depo_id,
                     params={'access_token': access_token},
                     data=json.dumps(mdata), headers=headers)
    # Log the metadata upload response; don't add a comma after the last entry
    # of the loop, so the concatenated log stays valid JSON
    logfile.write(json.dumps(r.json(), indent=4, sort_keys=True))
    quicklog.write('%s\t' % r.status_code)
    if i != ucount:
        logfile.write(",")
    # publish (disabled):
    # !!add another header to quicklog if you uncomment this
    # r = requests.post('https://zenodo.org/api/deposit/depositions/%s/actions/publish' % depo_id,
    #                   params={'access_token': access_token})
    # logfile.write(json.dumps(r.json(), indent=4, sort_keys=True))
    # quicklog.write('%s\t' % r.status_code)
    quicklog.write('\n')
# --- Driver: upload every specimen image to Zenodo ---
# Get a list of all deposition filenames, i.e. the barcodes (with jpg
# extension in this case)
files = os.listdir("/all 1800 jpg/")
# Link institution names to the custom variable setID, so they can be inserted
# into the metadata fields. The metadata originating (mostly) from GBIF is not
# sufficiently consistent for this.
insts = pd.read_csv('inst.csv', sep=';')
# Zenodo access token -- placeholder; fill in before running and never commit
# a real token to version control.
access_token = 'token here'
# Open a log file set for appending.
# To make this into proper json, it needs to be formatted as an array with
# square brackets and comma's need to be added between each json data dump.
logfile = open("superlog.txt", "a+")
# Also make a quick log for a quick overview of json status codes and coding fails
quicklog = open("quicklog.txt", "a+")
try:
    logfile.write("[")
    # Starting and ending point for upload
    scount = 0
    ucount = 1799
    quicklog.write('Barcode\tUpload\tJPG\tTIF\tJSON\tSegAll\tSegSel\tData\n')
    print(str(datetime.datetime.now()))
    for i in range(scount, ucount + 1):
        try:
            upload_to_sandbox(files[i], ucount, insts, access_token,
                              logfile, quicklog, i)
        except Exception:
            # Record the exception type and keep going with the remaining
            # uploads. (A bare except would also swallow KeyboardInterrupt.)
            quicklog.write('%s\n' % sys.exc_info()[0])
    print(str(datetime.datetime.now()))
    logfile.write("]")
finally:
    # Close the logs even if the loop aborts, so what was written is flushed
    logfile.close()
    quicklog.close()