# exec(open("Documents/mathias/zenod/test dataset to zenodo.py").read()) import requests import pandas as pd import json import os import datetime import re import sys def upload_to_sandbox(filename,ucount,insts,access_token,logfile,quicklog,i) : #filename is a string and should be a barcode.jpg #ucount is an integer: the point where the loop this function will run in ends, so a superfluous comma can be avoided in the json log #insts is a data frame linking institution name to herbarium codes, for the metadata #access_token can be obtained from a Zenodo account #logfile is a file where the json response will be dumped #quicklog is a tsv .txt file listing the status codes and indicating whether a file could be found or not quicklog.write('%s\t' % filename.replace('.jpg','')) #some generic input values for the requests headers = {"Content-Type": "application/json"} headers2 = {"Accept":"application/json", "Authorization": "Bearer %s" % access_token, "Content-Type": "application/octet-stream"} #text strings to fill if segmented images or tif images are present #Any formatting needs to be done in HTML segst = '' tifst = '' ## #1) Make an empty upload r = requests.post('https://zenodo.org/api/deposit/depositions', params = {'access_token': access_token},json = {}, headers = headers) #Log empty deposition response in logfile empty_upload_json = r.json() logfile.write(json.dumps(empty_upload_json, indent=4, sort_keys=True)) logfile.write(",") quicklog.write('%s\t' % r.status_code) #Retrieve deposition id and bucket_url for other requests bucket_url = r.json()['links']['bucket'] depo_id = r.json()['id'] ## #2a) Upload the image jpg #derive the filename and the file location fileloc = '/all 1800 jpg/%s' % filename #Make the put request r = requests.put('%s/%s' % (bucket_url,filename), data = open(fileloc,'rb'), headers = headers2) #Log image upload response in logfile empty_upload_json = r.json() logfile.write(json.dumps(empty_upload_json, indent=4, sort_keys=True)) logfile.write(",") quicklog.write('%s\t' % r.status_code) ## #2b) Upload the image tif, if any #derive the filename and the file location filename = filename.replace('.jpg','.tif') fileloc = '/tif_storage/all/%s' % filename #Make the put request if os.path.exists(fileloc): r = requests.put('%s/%s' % (bucket_url,filename), data = open(fileloc,'rb'), headers = headers2) #Log image upload response in logfile empty_upload_json = r.json() logfile.write(json.dumps(empty_upload_json, indent=4, sort_keys=True)) logfile.write(",") quicklog.write('%s\t' % r.status_code) tifst = '
    quicklog.write('%s\t' % filename.replace('.jpg',''))

    #some generic input values for the requests
    headers = {"Content-Type": "application/json"}
    headers2 = {"Accept":"application/json",
                "Authorization": "Bearer %s" % access_token,
                "Content-Type": "application/octet-stream"}

    #text strings to fill if segmented images or tif images are present
    #Any formatting needs to be done in HTML
    segst = ''
    tifst = ''

    ##
    #1) Make an empty upload
    r = requests.post('https://zenodo.org/api/deposit/depositions',
                      params = {'access_token': access_token},
                      json = {},
                      headers = headers)

    #Log empty deposition response in logfile
    empty_upload_json = r.json()
    logfile.write(json.dumps(empty_upload_json, indent=4, sort_keys=True))
    logfile.write(",")
    quicklog.write('%s\t' % r.status_code)

    #Retrieve deposition id and bucket_url for other requests
    bucket_url = r.json()['links']['bucket']
    depo_id = r.json()['id']

    ##
    #2a) Upload the image jpg
    #derive the filename and the file location
    fileloc = '/all 1800 jpg/%s' % filename

    #Make the put request
    r = requests.put('%s/%s' % (bucket_url,filename),
                     data = open(fileloc,'rb'),
                     headers = headers2)

    #Log image upload response in logfile
    empty_upload_json = r.json()
    logfile.write(json.dumps(empty_upload_json, indent=4, sort_keys=True))
    logfile.write(",")
    quicklog.write('%s\t' % r.status_code)

    ##
    #2b) Upload the image tif, if any
    #derive the filename and the file location
    filename = filename.replace('.jpg','.tif')
    fileloc = '/tif_storage/all/%s' % filename

    #Make the put request
    if os.path.exists(fileloc):
        r = requests.put('%s/%s' % (bucket_url,filename),
                         data = open(fileloc,'rb'),
                         headers = headers2)

        #Log image upload response in logfile
        empty_upload_json = r.json()
        logfile.write(json.dumps(empty_upload_json, indent=4, sort_keys=True))
        logfile.write(",")
        quicklog.write('%s\t' % r.status_code)

        tifst = '<br>- A lossless TIFF image from which the JPEG image has been derived.'
    else:
        quicklog.write('NONE\t')

    ##
    #2c) Upload the json data file
    #derive the filename and the file location
    fileloc = '/alljsons/%s.json' % filename.replace('.tif','')
    filename = filename.replace('.tif','.json')

    #Make the put request
    r = requests.put('%s/%s' % (bucket_url,filename),
                     data = open(fileloc,'rb'),
                     headers = headers2)

    #Log data upload response in logfile
    empty_upload_json = r.json()
    logfile.write(json.dumps(empty_upload_json, indent=4, sort_keys=True))
    logfile.write(",")
    quicklog.write('%s\t' % r.status_code)

    ##
    #2d) Segmentation PNG files, if any
    #derive the filename and the file location
    filename = filename.replace('.json','_all.png')
    seg = '/seg/%s' % filename

    #Make the put requests and log the upload responses if the files exist
    if os.path.exists(seg):
        r = requests.put('%s/%s' % (bucket_url,filename),
                         data = open(seg,'rb'),
                         headers = headers2)
        empty_upload_json = r.json()
        logfile.write(json.dumps(empty_upload_json, indent=4, sort_keys=True))
        logfile.write(",")
        quicklog.write('%s\t' % r.status_code)

        filename = filename.replace('_all','_sel')
        seg = '/seg/%s' % filename
        r = requests.put('%s/%s' % (bucket_url,filename),
                         data = open(seg,'rb'),
                         headers = headers2)
        empty_upload_json = r.json()
        logfile.write(json.dumps(empty_upload_json, indent=4, sort_keys=True))
        logfile.write(",")
        quicklog.write('%s\t' % r.status_code)

        segst = '<br>- Two PNG files containing segmented image overlays of the scanned herbarium sheet. The _all extension indicates that all labels, color charts and pieces of text\
 have received a different color against a black background color. The _sel extension indicates that these elements are white if they\'re barcode labels, yellow\
 if they\'re color charts and red if they\'re anything else.'
    else:
        quicklog.write('NONE\tNONE\t')
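    #Note: fileloc still points to the JSON-LD data file uploaded in step 2c;
    #it is read again below to build the Zenodo metadata.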
    ##
    #3) Set up zenodo metadata
    #Read the data file (windows encoding)
    fileread = json.load(open(fileloc,'r',encoding="iso-8859-15"))

    #Modify the file so the data fields can be added as subjects
    #Replace the namespace with entire url
    fileread2 = {k.replace('dcterms:','http://purl.org/dc/terms/')\
                 .replace('dwc:','http://rs.tdwg.org/dwc/terms/')\
                 .replace('dc:','http://purl.org/dc/elements/1.1/'): v
                 for k,v in fileread['@graph'][0].items() }

    #Remove custom terms
    fileread2 = {k:v for k,v in fileread2.items() if re.match('http://',k)}

    #Convert integers to strings and set up structure required for Zenodo
    fileread2 = {k:str(v) for k,v in fileread2.items()}
    fileread2 = [{'term':v,'identifier':k} for k,v in fileread2.items()]
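    #Illustrative example of the transformation above (hypothetical key/value):
    #  {'dwc:catalogNumber': 1234}  ->  [{'term': '1234', 'identifier': 'http://rs.tdwg.org/dwc/terms/catalogNumber'}]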

    #Formulate the data fields
    mdata = {
        'metadata': {
            'title': 'Herbarium specimen image of %s, part of the collection of %s' % (fileread['@graph'][0]['dwc:scientificName'],
                     insts['institution'][list(insts['setID']).index(fileread['@graph'][0]['setID'])]),
            'upload_type': 'image',
            'image_type': 'photo',
            'description': 'Part of a training dataset of scanned herbarium specimens. The data paper and a summary landing page will be published on Zenodo\
 as it gets published.<br><br>Content of this deposition:<br><br>- A JSON-LD datafile listing the label data associated with this herbarium specimen. The Darwin and\
 Dublin Core data standards are used for most values.<br>- A JPEG image file of the scanned herbarium sheet.%s%s' % (tifst,segst),
            'creators': [{'name': '%s' % insts['institution'][list(insts['setID']).index(fileread['@graph'][0]['setID'])]}],
            'grants': [{'id':'777483'}],
            'language': fileread['@graph'][0]['dcterms:language'],
            'related_identifiers': [{'identifier': fileread['@graph'][0]['@id'],'relation': 'isAlternateIdentifier'}],
            'keywords': ['biodiversity','herbarium sheet'],
            'subjects': fileread2,
            'communities': [{'identifier':'icedigtest'}]
        }
    }

    #Upload zenodo metadata
    r = requests.put('https://zenodo.org/api/deposit/depositions/%s' % depo_id,
                     params = {'access_token': access_token},
                     data = json.dumps(mdata),
                     headers = headers)

    #Log metadata upload response in logfile
    empty_upload_json = r.json()
    logfile.write(json.dumps(empty_upload_json, indent=4, sort_keys=True))
    quicklog.write('%s\t' % r.status_code)

    #Don't add a comma if at end of the loop
    if i != ucount:
        logfile.write(",")

    #publish:
    #!!add another header to quicklog if you uncomment this
    #r = requests.post('https://zenodo.org/api/deposit/depositions/%s/actions/publish' % depo_id,
    #                  params={'access_token': access_token} )
    #empty_upload_json = r.json()
    #logfile.write(json.dumps(empty_upload_json, indent=4, sort_keys=True))
    #quicklog.write('%s\t' % r.status_code)

    quicklog.write('\n')
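#Main script. It expects the JPG directory used above ('/all 1800 jpg/') and an inst.csv file.
#inst.csv is semicolon-separated and must contain at least the columns 'setID' and 'institution',
#which are used to fill the title and creators metadata. Hypothetical example rows:
#  setID;institution
#  0;Example Herbarium A
#  1;Example Herbarium B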
#Get a list of all deposition filenames, i.e. the barcodes (with jpg extension in this case)
files = os.listdir("/all 1800 jpg/")

#Link institution names to the custom variable setID, so they can be inserted into the metadata fields.
#The metadata originating (mostly) from GBIF is not sufficiently consistent for this.
insts = pd.read_csv('inst.csv',sep=';')

#My access token
access_token = 'token here'

#Open a log file set for appending
#To make this into proper json, it needs to be formatted as an array with square brackets
#and commas need to be added between each json data dump
logfile=open("superlog.txt","a+")
logfile.write("[")

#Starting and ending point for upload
scount = 0
ucount = 1799

#Also make a quick log for a quick overview of json status codes and coding fails
quicklog=open("quicklog.txt","a+")
quicklog.write('Barcode\tUpload\tJPG\tTIF\tJSON\tSegAll\tSegSel\tData\n')

print(str(datetime.datetime.now()))

for i in range(scount, ucount+1):
    try:
        upload_to_sandbox(files[i],ucount,insts,access_token,logfile,quicklog,i)
    except:
        quicklog.write('%s\n' % sys.exc_info()[0])

print(str(datetime.datetime.now()))

logfile.write("]")
logfile.close()
quicklog.close()