-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreferred_suborg_names.py
114 lines (95 loc) · 3.47 KB
/
preferred_suborg_names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""
Utility for creating preferred names from spreadsheet
for wos:SubOrganizations in rap.
"""
import argparse
import sys
import csv
from collections import defaultdict
from rdflib import Literal, Graph
from namespaces import D, WOS, RDF, RDFS, VIVO, rq_prefixes
from lib import backend
from settings import (
logger,
ADDRESS_GRAPH,
SUBORG_GRAPH,
CLEAN_SUBORG_GRAPH
)
from publications import slug_uri
def get_existing_address(uri):
vstore = backend.get_store()
rq = rq_prefixes + """
SELECT ?address
WHERE {
?uri vivo:relatedBy ?address.
?address a wos:Address .
}
"""
rmg = Graph()
addr_uris = []
for row in vstore.query(rq, initBindings={'uri': uri}):
addr_uris.append(row.address)
rmg.add((row.address, VIVO.relates, uri))
rmg.add((uri, VIVO.relatedBy, row.address))
return addr_uris, rmg
def index_orgs(name_key):
uri_pubs = defaultdict(list)
idx = defaultdict(lambda: defaultdict(dict))
vstore = backend.get_store()
rmg = Graph()
addg = Graph()
q = rq_prefixes + """
SELECT DISTINCT ?org ?name
WHERE {
?org a wos:SubOrganization ;
rdfs:label ?label ;
wos:subOrganizationName ?name ;
vivo:relatedBy ?address .
?address a wos:Address ;
vivo:relates ?pub, d:org-technical-university-of-denmark .
}
"""
for row in vstore.query(q):
#print row.org, row.name
existing_name = row.name
pname = name_key.get(existing_name.toPython())
if pname is not None:
logger.info("Processing existing name {} to clean name {}.".format(existing_name, pname))
addr_uris, to_remove = get_existing_address(row.org)
rmg += to_remove
new_uri = slug_uri(pname, prefix="dtusuborg")
addg.add((new_uri, RDF.type, WOS.SubOrganization))
addg.add((new_uri, RDFS.label, Literal(pname)))
addg.add((new_uri, WOS.subOrganizationName, Literal(pname)))
addg.add((new_uri, WOS.subOrganizationNameVariant, existing_name))
for auri in addr_uris:
addg.add((auri, VIVO.relates, new_uri))
return addg, rmg
def process(clean_file, dry=False):
vstore = backend.get_store()
name_key = dict()
# Map clean names
with open(clean_file) as inf:
inf.next()
for n, row in enumerate(csv.reader(inf)):
pname, ename = row[0].strip(), row[1].strip()
if ename == "":
continue
logger.info("Cleaning {} with variant {}.".format(pname, ename))
# Always create variant for the preferred name too.
name_key[pname] = pname
name_key[ename] = pname
addg, removeg = index_orgs(name_key)
graphs = [ADDRESS_GRAPH, SUBORG_GRAPH]
# Remove from these graphs
for g in graphs:
logger.info("Removing preferred name triples with {} triples from {} graph.".format(len(removeg), g))
rm2 = vstore.bulk_remove(g, removeg)
logger.info("Adding preferred name triples with {} triples.".format(len(addg)))
add = vstore.bulk_add(, addg)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Preferred names')
parser.add_argument('--dry-run', '-d', action="store_true", dest="dry", default=False, help="Dry run.")
parser.add_argument('--suborg-names', '-s', required=True, type=str)
args = parser.parse_args()
done = process(args.suborg_names, dry=args.dry)