Skip to content

Commit 8720c3a

Browse files
committed
merged develop
2 parents 1607c3b + cbde56f commit 8720c3a

12 files changed

Lines changed: 450 additions & 154 deletions

File tree

LICENSE.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
The MIT License (MIT)
2+
=====================
3+
4+
Copyright © 2019 Nicholas Woodward
5+
6+
Permission is hereby granted, free of charge, to any person
7+
obtaining a copy of this software and associated documentation
8+
files (the “Software”), to deal in the Software without
9+
restriction, including without limitation the rights to use,
10+
copy, modify, merge, publish, distribute, sublicense, and/or sell
11+
copies of the Software, and to permit persons to whom the
12+
Software is furnished to do so, subject to the following
13+
conditions:
14+
15+
The above copyright notice and this permission notice shall be
16+
included in all copies or substantial portions of the Software.
17+
18+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
19+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
20+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25+
OTHER DEALINGS IN THE SOFTWARE.

config/application.yml.sample

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ dataverse_db_host: ''
44
dataverse_db_name: ''
55
dataverse_db_username: ''
66
dataverse_db_password: ''
7+
include_dataset_metrics: false
78
work_dir: '/tmp'
89
log_path: 'logs'
910
log_file: 'dataverse-reports.log'

lib/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
__version__ = "1.0.0"

lib/api.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,19 @@ def get_dataverse_contents(self, identifier=''):
6868
response_json = response.json()
6969
return response_json['data']
7070

71+
def get_dataverse_size(self, identifier='', includeCached=False):
    """Request the storage size report for a dataverse.

    Issues a GET against the ``/dataverses/<identifier>/storagesize``
    endpoint of the configured Dataverse host.

    Args:
        identifier: Id or alias of the dataverse; converted with ``str()``
            before being placed in the URL.
        includeCached: When exactly ``True``, appends the
            ``includeCached=true`` query parameter to the request.

    Returns:
        The ``requests`` response object, or ``None`` when ``identifier``
        is ``None``.
    """
    # NOTE(review): the default identifier is '' which is not caught by this
    # guard; only an explicit None is rejected. Left unchanged because
    # callers dereference the returned response.
    if identifier is None:
        self.logger.error("Must specify identifer.")
        return None

    url = self.host + 'api/' + self.version + '/dataverses/' + str(identifier) + '/storagesize'
    if includeCached is True:
        # The Dataverse native API names this parameter "includeCached";
        # the previous spelling "includeCache" was ignored by the server.
        url += '?includeCached=true'
    self.logger.debug("Retrieving dataverse storage size: %s", url)
    response = requests.get(url, headers=self.headers)
    self.logger.debug("Return status: %s", str(response.status_code))
    return response
83+
7184
def sword_get_dataverse(self, alias=''):
7285
if alias is None:
7386
self.logger.error("Must specify an alias.")
@@ -92,6 +105,24 @@ def get_dataset(self, identifier=''):
92105
self.logger.debug("Return status: %s", str(response.status_code))
93106
return response
94107

108+
def get_dataset_metric(self, identifier='', option='', doi=''):
    """Fetch one Make Data Count metric for a dataset.

    Args:
        identifier: Dataset id placed in the URL path (via ``str()``).
        option: Metric name appended after ``/makeDataCount/`` (via ``str()``).
        doi: Value sent as the ``persistentId`` query parameter; it is
            concatenated as-is, so it must already be a string.

    Returns:
        The ``requests`` response object, or ``None`` when any of the three
        arguments is ``None``.
    """
    # Reject the call as soon as any required argument is missing.
    if any(arg is None for arg in (identifier, option, doi)):
        self.logger.error("Must specify an identifer, option and DOI.")
        return

    api_root = self.host + 'api/' + self.version
    metric_path = '/datasets/' + str(identifier) + '/makeDataCount/' + str(option)
    url = api_root + metric_path + '?persistentId=' + doi

    self.logger.debug("Retrieving dataset_metric: %s", url)
    reply = requests.get(url, headers=self.headers)
    self.logger.debug("Return status: %s", str(reply.status_code))
    return reply
118+
119+
def get_admin_list_users(self, page=1):
    """Retrieve one page of the admin user listing.

    Args:
        page: Page number sent as the ``selectedPage`` query parameter
            (converted with ``str()``).

    Returns:
        The decoded JSON body of the response.
    """
    endpoint = self.host + 'api/' + self.version + '/admin/list-users/'
    url = endpoint + '?selectedPage=' + str(page)
    self.logger.debug("Retrieving users list: %s", url)

    reply = requests.get(url, headers=self.headers)
    self.logger.debug("Return status: %s", str(reply.status_code))

    payload = reply.json()
    return payload
125+
95126
def construct_parameters(self, params={}):
96127
parameters = ''
97128
first = True

lib/database.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def get_download_count(self, dataset_id=None):
3030
return
3131

3232
cursor = self.conn.cursor()
33-
cursor.execute("SELECT COUNT(id) FROM guestbookresponse WHERE downloadtype = 'Download' AND dataset_id = %s;", [str(dataset_id)])
33+
cursor.execute("SELECT COUNT(g.id) FROM guestbookresponse g LEFT JOIN filedownload f on g.id = f.guestbookresponse_id WHERE g.dataset_id = %s;", [str(dataset_id)])
3434
result = cursor.fetchone()
3535
count = result[0]
3636
return count

lib/output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def save_report_excel_file(self, output_file_path=None, worksheet_files=[]):
5555
if len(filename_parts) == 2:
5656
workbook_name = filename_parts[1]
5757
else:
58-
workbook_name = ''
58+
workbook_name = filename
5959

6060
worksheet = workbook.add_worksheet(workbook_name)
6161
with open(worksheet_file, 'rt', encoding='utf8') as f:

reports/dataset.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,13 @@ def __init__(self, dataverse_api=None, dataverse_database=None, config=None):
3232

3333
self.logger = logging.getLogger('dataverse-reports')
3434

35-
def report_datasets_recursive(self, account_info):
35+
def report_datasets_recursive(self, dataverse_identifier):
3636
# List of datasets
3737
datasets = []
3838

39-
self.logger.info("Begin loading datasets for %s.", account_info['identifier'])
40-
self.load_datasets_recursive(datasets, account_info['identifier'])
41-
self.logger.info("Finished loading %s datasets for %s", str(len(datasets)), account_info['identifier'])
39+
self.logger.info("Begin loading datasets for %s.", dataverse_identifier)
40+
self.load_datasets_recursive(datasets, dataverse_identifier)
41+
self.logger.info("Finished loading %s datasets for %s", str(len(datasets)), dataverse_identifier)
4242

4343
return datasets
4444

@@ -64,16 +64,17 @@ def load_datasets_recursive(self, datasets={}, dataverse_identifier=None):
6464
if dvObject['type'] == 'dataset':
6565
# Add dataset to this dataverse
6666
self.logger.info("Adding dataset %s to dataverse %s.", str(dvObject['id']), str(dataverse_identifier))
67-
self.add_dataset(datasets, dataverse_identifier, dvObject['id'])
67+
self.add_dataset(datasets, dataverse_identifier, dvObject['id'], dvObject['identifier'])
6868
if dvObject['type'] == 'dataverse':
6969
self.logger.info("Found new dataverse %s.", str(dvObject['id']))
7070
self.load_datasets_recursive(datasets, dvObject['id'])
7171
else:
7272
self.logger.warn("Dataverse was empty.")
7373

74-
def add_dataset(self, datasets, dataverse_identifier, dataset_id):
74+
def add_dataset(self, datasets, dataverse_identifier, dataset_id, dataset_identifier):
7575
# Load dataset
7676
self.logger.info("Dataset id: %s", dataset_id)
77+
self.logger.info("Dataset identifier: %s", dataset_identifier)
7778
dataset_response = self.dataverse_api.get_dataset(identifier=dataset_id)
7879
response_json = dataset_response.json()
7980
if 'data' in response_json:
@@ -103,23 +104,43 @@ def add_dataset(self, datasets, dataverse_identifier, dataset_id):
103104
# Remove nested information
104105
dataset.pop('latestVersion')
105106

107+
if (self.config['include_dataset_metrics']):
108+
# Use Make Data Count endpoints to gather views and downloads statistics
109+
dataset_metrics_options = ['viewsUnique', 'viewsTotal', 'downloadsUnique', 'downloadsTotal']
110+
for dataset_metrics_option in dataset_metrics_options:
111+
self.logger.debug("Calling endpoint for dataset metric: " + dataset_metrics_option)
112+
dataset_metrics_response = self.dataverse_api.get_dataset_metric(identifier=dataset_id,option=dataset_metrics_option,doi=dataset_identifier)
113+
dataset_metrics_json = dataset_metrics_response.json()
114+
if dataset_metrics_json['status'] == 'OK' and dataset_metrics_option in dataset_metrics_json['data']:
115+
self.logger.info("MDC metric (" + dataset_metrics_option + "): " + str(dataset_metrics_json['data'][dataset_metrics_option]))
116+
dataset[dataset_metrics_option] = dataset_metrics_json['data'][dataset_metrics_option]
117+
else:
118+
self.logger.debug("Call was unsuccessfull.")
119+
dataset[dataset_metrics_option] = 0
120+
106121
# Use dataverse_database to retrieve cumulative download count of file in this dataset
107122
download_count = self.dataverse_database.get_download_count(dataset_id=dataset_id)
108123
self.logger.info("Download count for dataset: %s", str(download_count))
109124
dataset['downloadCount'] = download_count
110125

111126
if 'files' in dataset:
112127
contentSize = 0
128+
count_restricted = 0
113129
files = dataset['files']
114130
for file in files:
115131
if 'dataFile' in file:
132+
if file['restricted']:
133+
count_restricted += 1
116134
dataFile = file['dataFile']
117135
filesize = int(dataFile['filesize'])
118136
contentSize += filesize
119137
self.logger.info('Totel size (bytes) of all files in this dataset: %s', str(contentSize))
120138
# Convert to megabytes for reports
121139
dataset['contentSize (MB)'] = (contentSize/1048576)
122140

141+
dataset['totalFiles'] = len(files)
142+
dataset['totalRestrictedFiles'] = count_restricted
143+
123144
# Retrieve dataverse to get alias
124145
dataverse_response = self.dataverse_api.get_dataverse(identifier=dataverse_identifier)
125146
response_json = dataverse_response.json()

reports/dataverse.py

Lines changed: 74 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,15 @@
22
import sys
33
import csv
44
import pprint
5+
import re
56
import smtplib
67
import mimetypes
78
import logging
89
from email.mime.multipart import MIMEMultipart
910
from email.mime.text import MIMEText
1011

12+
from .user import UserReports
13+
1114
class DataverseReports(object):
1215
def __init__(self, dataverse_api=None, config=None):
1316
if dataverse_api is None:
@@ -19,6 +22,12 @@ def __init__(self, dataverse_api=None, config=None):
1922
return
2023

2124
self.dataverse_api = dataverse_api
25+
self.config = config
26+
self.dataverse_size_pattern = re.compile('dataverse:\s(.*)\sbyte')
27+
self.logger = logging.getLogger('dataverse-reports')
28+
29+
# Create UserReports object to retrieve user metadata
30+
self.user_reports = UserReports(dataverse_api=dataverse_api, config=config)
2231

2332
# Ensure trailing slash on work_dir
2433
if config['work_dir'][len(config['work_dir'])-1] != '/':
@@ -28,15 +37,12 @@ def __init__(self, dataverse_api=None, config=None):
2837
self.ns = {'atom': 'http://www.w3.org/2005/Atom',
2938
'sword': 'http://purl.org/net/sword/terms/state'}
3039

31-
self.config = config
32-
self.logger = logging.getLogger('dataverse-reports')
33-
34-
def report_dataverses_recursive(self, account_info):
40+
def report_dataverses_recursive(self, dataverse_identifier):
3541
# List of dataverses
3642
dataverses = []
3743

3844
# Load dataverses
39-
self.load_dataverses_recursive(dataverses, account_info['identifier'])
45+
self.load_dataverses_recursive(dataverses, dataverse_identifier)
4046

4147
return dataverses
4248

@@ -64,21 +70,71 @@ def load_dataverse(self, dataverses, dataverse_identifier):
6470

6571
self.logger.info("Dataverse name: %s", dataverse['name'])
6672

67-
# Flatten the nested creator information
68-
if 'creator' in dataverse:
73+
# Flatten the nested contact information
74+
if 'dataverseContacts' in dataverse:
75+
dataverseContacts = dataverse['dataverseContacts']
76+
if len(dataverseContacts) > 0:
77+
self.logger.debug("The dataverseContacts list contains " + str(len(dataverseContacts)) + " contacts.")
78+
dataverseContact = dataverseContacts[0]
79+
if 'contactEmail' in dataverseContact:
80+
contactEmail = dataverseContact['contactEmail'].strip()
81+
self.logger.debug("Found email of dataverse contact: %s", str(contactEmail))
82+
user = self.user_reports.find_user_email(contactEmail)
83+
if bool(user):
84+
self.logger.debug("Adding contact information: %s", user)
85+
if 'userIdentifier' in user:
86+
dataverse['contactIdentifier'] = user['userIdentifier']
87+
if 'firstName' in user:
88+
dataverse['contactFirstName'] = user['firstName']
89+
if 'lastName' in user:
90+
dataverse['contactLastName'] = user['lastName']
91+
if 'email' in user:
92+
dataverse['contactEmail'] = user['email']
93+
if 'affiliation' in user:
94+
dataverse['contactAffiliation'] = user['affiliation']
95+
if 'roles' in user:
96+
dataverse['contactRoles'] = user['roles']
97+
else:
98+
self.logger.warn("Unable to find user from dataverseContact email: " + contactEmail)
99+
dataverse['contactEmail'] = contactEmail
100+
else:
101+
self.logger.warn("First dataverseContact doesn't have an email.")
102+
else:
103+
self.logger.warn("List of dataverseContacts is empty.")
104+
elif 'creator' in dataverse: # Legacy field in older Dataverse versions
69105
self.logger.debug("Replacing creator array.")
70106
creator = dataverse['creator']
71107
if 'identifier' in creator:
72-
dataverse['creatorIdentifier'] = creator['identifier']
108+
dataverse['contactIdentifier'] = creator['identifier']
73109
if 'displayName' in creator:
74-
dataverse['creatorName'] = creator['displayName']
110+
dataverse['contactName'] = creator['displayName']
75111
if 'email' in creator:
76-
dataverse['creatorEmail'] = creator['email']
112+
dataverse['contactEmail'] = creator['email']
77113
if 'affiliation' in creator:
78-
dataverse['creatorAffiliation'] = creator['affiliation']
114+
dataverse['contactAffiliation'] = creator['affiliation']
79115
if 'position' in creator:
80-
dataverse['creatorPosition'] = creator['position']
116+
dataverse['contactPosition'] = creator['position']
81117
dataverse.pop('creator')
118+
else:
119+
self.logger.warn("Unable to find dataverse contact information.")
120+
121+
# Add the data (file) size of the dataverse and all its sub-dataverses
122+
dataverse_size_response = self.dataverse_api.get_dataverse_size(identifier=dataverse_identifier, includeCached=True)
123+
response_size_json = dataverse_size_response.json()
124+
if response_size_json['status'] == 'OK' and 'data' in response_size_json:
125+
dataverse_size = response_size_json['data']
126+
if 'message' in dataverse_size:
127+
size_message = dataverse_size['message']
128+
self.logger.debug("The message element from storagesize endpoint: " + size_message)
129+
size_bytes_match = re.search(self.dataverse_size_pattern, size_message)
130+
if size_bytes_match is not None:
131+
size_bytes_string = size_bytes_match.group(1)
132+
size_bytes = int(size_bytes_string.replace(',',''))
133+
dataverse['contentSize (MB)'] = (size_bytes/1048576)
134+
else:
135+
self.logger.warning("Unable to find the bytes value in the message.")
136+
else:
137+
self.logger.warning("No message element in response from storagesize endpoint.")
82138

83139
# Add the 'dataverseHasBeenReleased' field from the Sword API
84140
if 'alias' in dataverse:
@@ -94,6 +150,12 @@ def load_dataverse(self, dataverses, dataverse_identifier):
94150
else:
95151
self.logger.debug("Element 'dataverseHasBeenReleased' is not present in XML.")
96152

153+
# Load datasets
154+
#dataverse_contents = self.dataverse_api.get_dataverse_contents(identifier=dataverse_identifier)
155+
#for dvObject in dataverse_contents:
156+
#if dvObject['type'] == 'dataset':
157+
#self.load_dataset(dataverse, dvObject['id'])
158+
97159
dataverses.append(dataverse)
98160
else:
99161
self.logger.warn("Dataverse was empty.")

0 commit comments

Comments
 (0)