Skip to content

Commit 97b5d18

Browse files
initial
1 parent 9f56d7f commit 97b5d18

1 file changed

Lines changed: 165 additions & 0 deletions

File tree

src/getCkan.py

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
import os
import sys
import urllib
import urllib.error
import urllib.request

import ckanapi.errors
import easygui as gui
from ckanapi import RemoteCKAN
6+
7+
# sources: one (name, download base, access url) tuple per CKAN portal
#   name          - label offered in the selection dialog, also used to build
#                   the names of the output directory and the items csv
#   download base - text a resource url must contain to count as internal
#                   (only checked while excludext is set)
#   access url    - address handed to RemoteCKAN
sources = [
    ("berlin", "berlin.de", "https://datenregister.berlin.de/"),
    ("karlsruhe", "karlsruhe.de", "https://transparenz.karlsruhe.de/"),
    ("bahn", "deutschebahn.com", "https://data.deutschebahn.com/"),
    ("offeneDaten", "offenedaten.de/", "https://offenedaten.de/"),
    ("muenchen", "opengov-muenchen.de", "https://www.opengov-muenchen.de"),
    ("meerbusch", "meerbusch.de", "https://opendata.meerbusch.de/"),
    ("bonn", "bonn.de", "https://opendata.bonn.de/"),
    ("jena", "jena.de", "https://opendata.jena.de/"),
    ("africa", "africaopendata.org", "https://africaopendata.org/"),
]

# file types we want to download (compared against the lower-cased extension
# of the resource url)
downs = (
    ".csv", ".json", ".geojson", ".gpx", ".kmz",
    ".xls", ".xlx", ".ods", ".xlsx", ".pdf",
)
download = True   # Download file or not
excludext = True  # exclude external resources or not.
24+
25+
26+
#########
27+
def config():
    """Ask the user which CKAN source to use.

    Opens an easygui choice box that lists the name (first element) of every
    entry in the module-level ``sources`` list.

    Returns:
        The index into ``sources`` of the selected entry, or ``None`` when
        the dialog returned ``None`` (nothing was selected).
    """
    names = [s[0] for s in sources]
    choice = gui.choicebox("Select CKAN origin", "Source select", names)
    if choice is None:
        return None

    # translate the selected name back into its position in sources
    for index, name in enumerate(names):
        if choice == name:
            print("Selected: ", index)
            return index

    # Not expected, since only names from the list are offered; hand the raw
    # dialog answer back unchanged in that case.
    return choice
43+
44+
#########
45+
46+
def loadUrl(p,u):
    """Download the file at url ``u`` into the directory ``filedir``.

    ``filedir`` is a module-level name that has to be set before the first
    call. The local file name is ``p``, an underscore, and the part of ``u``
    that follows its last "/".

    Args:
        p: prefix for the local file name.
        u: url of the file to download.

    Raises:
        urllib.error.HTTPError: for every HTTP error except 404. A 404 is
            only reported on stdout and the function returns normally.
    """
    target = os.path.join(filedir, p + "_" + u.rsplit("/", 1)[-1])

    try:
        urllib.request.urlretrieve(u, target)
    except urllib.error.HTTPError as err:
        if err.code != 404:
            raise
        print("URL not found: ", u)
57+
58+
#########
59+
60+
# prepare ...


def _group_label(group):
    """Return the label of one group entry of a package.

    Uses the entry's "name", falls back to its "title", and finally to the
    literal "no name" when both are missing.
    """
    label = group.get("name")
    if label is None:
        label = group.get("title")
    if label is None:
        label = "no name"
    return label


def _csv_field(value):
    """Render one value as a field of the items csv.

    Strings are wrapped in double quotes, with embedded double quotes
    doubled so that a quote inside the text cannot split the row. Anything
    that is not a string (e.g. None for a missing key) becomes an empty field.
    """
    if isinstance(value, str):
        return '"' + value.replace('"', '""') + '"'
    return ""


agent = 'ckanapi/3.0 (+http://digital-codes.de)'
sel = config()

# nothing was selected in the dialog: nothing to do
if sel is None:
    sys.exit(0)

sourceName, downloadBase, accessUrl = sources[sel]

# downloads go to a per-source directory; loadUrl() reads this global
filedir = "files-" + sourceName
os.makedirs(filedir, exist_ok=True)

ckanGet = RemoteCKAN(accessUrl, user_agent=agent)

# access
grps = ckanGet.action.group_list()
print("Groups:\n", grps)

pkgs = ckanGet.action.package_list()
print("Packages:\n", pkgs)

# one list of already formatted csv fields per listed resource
items = []

# iterate over packages and resources
for p in pkgs:
    try:
        pk = ckanGet.action.package_show(id=p)
    except ckanapi.errors.NotFound:
        print("Package not found: ", p)
        continue

    print("################")
    print("\n\nPackage ", p)

    # every group is printed; with several groups the last one is the one
    # that ends up in the csv row
    gpname = ""
    for g in pk.get("groups") or ():
        gpname = _group_label(g)
        print("Group: ", gpname)

    u = pk.get("url")
    if u is not None and u != "":
        print("Url:", u)

    resources = pk.get("resources")
    if resources is None:
        continue

    print("#########")
    for rr in resources:
        ru = rr.get("url")
        print("\nResource: ", ru)

        # a resource without url can neither be listed nor downloaded; this
        # has to be checked before the url is searched below
        if ru is None:
            continue

        # check and skip external urls
        if excludext and downloadBase not in ru:
            print("External url: ", ru)
            continue

        items.append([
            _csv_field(x)
            for x in (
                p,
                gpname,
                pk.get("title"),
                pk.get("license_id"),
                pk.get("notes"),
                rr.get("name"),
                rr.get("description"),
                rr.get("last_modified"),
                ru,
            )
        ])

        ext = os.path.splitext(ru)[1]
        if download and ext.lower() in downs:
            try:
                loadUrl(p, ru)
            except urllib.error.URLError:
                # best effort: report the failure and go on with the next
                # resource (HTTP errors re-raised by loadUrl land here too)
                print("url error")


# write resource description to csv
# The encoding is given explicitly so that the file does not depend on the
# platform default and text outside that default cannot abort the write.
fieldnames = ("package", "group", "title", "license_id", "notes", "name",
              "description", "last_modified", "url")
with open("items-" + sourceName + ".csv", 'w', encoding="utf-8") as itemfile:
    itemfile.write(",".join(_csv_field(f) for f in fieldnames) + "\n")
    for row in items:
        itemfile.write(",".join(row) + "\n")
164+
165+

0 commit comments

Comments
 (0)