Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/biolink/kgx
Browse files Browse the repository at this point in the history
  • Loading branch information
sierra-moxon committed Sep 23, 2021
2 parents f15f944 + c0f8411 commit 420699a
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 47 deletions.
77 changes: 47 additions & 30 deletions kgx/graph_operations/meta_knowledge_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,13 @@ class MetaKnowledgeGraph:
error_log = stderr

def __init__(
self,
name="",
node_facet_properties: Optional[List] = None,
edge_facet_properties: Optional[List] = None,
progress_monitor: Optional[Callable[[GraphEntityType, List], None]] = None,
error_log=None,
**kwargs,
self,
name="",
node_facet_properties: Optional[List] = None,
edge_facet_properties: Optional[List] = None,
progress_monitor: Optional[Callable[[GraphEntityType, List], None]] = None,
error_log=None,
**kwargs,
):
"""
MetaKnowledgeGraph constructor.
Expand Down Expand Up @@ -213,8 +213,8 @@ def __init__(self, category_curie: str, mkg):
Biolink Model category CURIE identifier.
"""
if not (
_category_curie_regexp.fullmatch(category_curie)
or category_curie == "unknown"
_category_curie_regexp.fullmatch(category_curie)
or category_curie == "unknown"
):
raise RuntimeError("Invalid Biolink category CURIE: " + category_curie)

Expand Down Expand Up @@ -280,7 +280,7 @@ def get_count(self) -> int:
return self.category_stats["count"]

def get_count_by_source(
self, facet: str = "provided_by", source: str = None
self, facet: str = "provided_by", source: str = None
) -> Dict[str, Any]:
"""
Parameters
Expand Down Expand Up @@ -469,8 +469,24 @@ def _compile_triple_source_stats(self, triple: Tuple[str, str, str], data: Dict)
data,
)

@staticmethod
def _normalize_relation_field(field) -> Set:
    """
    Coerce the 'relation' field of a KGX edge record into a set of terms.

    Parameters
    ----------
    field
        The raw 'relation' value: either a single string term, or a list,
        tuple, set or frozenset of terms (possibly containing duplicates).

    Returns
    -------
    Set
        The distinct relation terms found in ``field``. A single string
        is wrapped in a one-element set for uniformity.

    Raises
    ------
    TypeError
        If ``field`` is neither a string nor one of the supported collections.
    """
    # A string is itself iterable: passing it to set() would explode the
    # term into single characters, so it is wrapped rather than iterated.
    if isinstance(field, str):
        return {field}

    # Checked against the builtin types in a single isinstance() call;
    # building a set also eliminates duplicate terms.
    if isinstance(field, (list, tuple, set, frozenset)):
        return set(field)

    raise TypeError(f"Unexpected KGX edge 'relation' data field of type '{type(field)}'")

def _process_triple(
self, subject_category: str, predicate: str, object_category: str, data: Dict
self, subject_category: str, predicate: str, object_category: str, data: Dict
):
# Process the 'valid' S-P-O triple here...
triple = (subject_category, predicate, object_category)
Expand All @@ -484,11 +500,13 @@ def _process_triple(
"count": 0,
}

if (
"relation" in data
and data["relation"] not in self.association_map[triple]["relations"]
):
self.association_map[triple]["relations"].add(data["relation"])
# patch for observed defect in some ETL's such as the July 2021 SRI Reference graph
# in which the relation field ends up being a list of terms, sometimes duplicated

if "relation" in data:
# input data["relation"] is normalized to a Set here
data["relation"] = self._normalize_relation_field(data["relation"])
self.association_map[triple]["relations"].update(data["relation"])

self.association_map[triple]["count"] += 1

Expand Down Expand Up @@ -545,7 +563,6 @@ def analyse_edge(self, u, v, k, data) -> None:
return

for obj_cat_idx in self.node_catalog[v]:

object_category: str = self.Category.get_category_curie_from_index(
obj_cat_idx
)
Expand Down Expand Up @@ -733,12 +750,12 @@ def get_total_edge_counts_across_mappings(self) -> int:
return count

def get_edge_count_by_source(
self,
subject_category: str,
predicate: str,
object_category: str,
facet: str = "knowledge_source",
source: Optional[str] = None,
self,
subject_category: str,
predicate: str,
object_category: str,
facet: str = "knowledge_source",
source: Optional[str] = None,
) -> Dict[str, Any]:
"""
Returns count by source for one S-P-O triple (S, O being Biolink categories; P, a Biolink predicate)
Expand All @@ -751,8 +768,8 @@ def get_edge_count_by_source(
return dict()
triple = (subject_category, predicate, object_category)
if (
triple in self.association_map
and "count_by_source" in self.association_map[triple]
triple in self.association_map
and "count_by_source" in self.association_map[triple]
):
if facet in self.association_map[triple]["count_by_source"]:
if source:
Expand Down Expand Up @@ -902,10 +919,10 @@ def save(self, file, name: str = None, file_format: str = "json") -> None:
yaml.dump(stats, file)


def generate_meta_knowledge_graph(graph: BaseGraph, name: str, filename: str) -> None:
def generate_meta_knowledge_graph(graph: BaseGraph, name: str, filename: str, **kwargs) -> None:
"""
Generate a knowledge map that describes the composition of the graph
and write to ``filename``.
Generate a knowledge map that describes
the composition of the graph and write to ``filename``.
Parameters
----------
Expand All @@ -917,7 +934,7 @@ def generate_meta_knowledge_graph(graph: BaseGraph, name: str, filename: str) ->
The file to write the knowledge map to
"""
graph_stats = summarize_graph(graph, name)
graph_stats = summarize_graph(graph, name, **kwargs)
with open(filename, mode="w") as mkgh:
dump(graph_stats, mkgh, indent=4, default=mkg_default)

Expand All @@ -940,5 +957,5 @@ def summarize_graph(graph: BaseGraph, name: str = None, **kwargs) -> Dict:
Dict
A TRAPI 1.1 compliant meta knowledge graph of the knowledge graph returned as a dictionary.
"""
mkg = MetaKnowledgeGraph(name)
mkg = MetaKnowledgeGraph(name, **kwargs)
return mkg.summarize_graph(graph)
8 changes: 8 additions & 0 deletions tests/resources/complex_graph_edges.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
subject object predicate relation aggregator_knowledge_source
HGNC:10848 NCBIGene:6469 biolink:interacts_with RO:0002434 biogrid
HGNC:10848 HGNC:9398 biolink:interacts_with RO:0002434|RO:0002434|RO:0002434 string
HGNC:10848 HGNC:9399 biolink:interacts_with RO:0002434 string
HGNC:10848 HGNC:16265 biolink:interacts_with RO:0002434 string
HGNC:10848 HGNC:16787 biolink:interacts_with RO:0002434 biogrid
HGNC:10848 GO:0009986 biolink:part_of BFO:0000050 go
HGNC:10848 GO:0097190 biolink:related_to RO:0002331|RO:0002327 go
10 changes: 10 additions & 0 deletions tests/resources/complex_graph_nodes.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
id name category taxon
HGNC:10848 SHH (human) biolink:Gene NCBITaxon:9606
NCBIGene:6469 SHH biolink:Gene NCBITaxon:9606
HGNC:9398 OLIG2 biolink:Gene NCBITaxon:9606
HGNC:9399 PRKCD biolink:Gene NCBITaxon:9606
HGNC:16265 WNT5B biolink:Gene NCBITaxon:9606
HGNC:16466 SUFU biolink:Gene NCBITaxon:9606
HGNC:16787 EDEM3 biolink:Gene NCBITaxon:9606
GO:0009986 cell surface biolink:CellularComponent
GO:0097190 apoptotic signaling pathway biolink:BiologicalProcess
16 changes: 15 additions & 1 deletion tests/unit/test_cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,15 @@ def test_graph_summary1():
os.path.join(RESOURCE_DIR, "graph_edges.tsv"),
]
output = os.path.join(TARGET_DIR, "graph_stats1.yaml")
summary_stats = graph_summary(inputs, "tsv", None, output, report_type="kgx-map")
summary_stats = graph_summary(
inputs,
"tsv",
None,
output,
node_facet_properties=["provided_by"],
edge_facet_properties=["aggregator_knowledge_source"],
report_type="kgx-map"
)

assert os.path.exists(output)
assert summary_stats
Expand Down Expand Up @@ -74,6 +82,8 @@ def test_graph_summary2a():
None,
output,
report_type="meta-knowledge-graph",
node_facet_properties=["provided_by"],
edge_facet_properties=["aggregator_knowledge_source"],
graph_name="Default Meta-Knowledge-Graph",
)

Expand Down Expand Up @@ -101,6 +111,8 @@ def test_graph_summary2b():
None,
output,
report_type="meta-knowledge-graph",
node_facet_properties=["provided_by"],
edge_facet_properties=["aggregator_knowledge_source"],
report_format="yaml",
)

Expand All @@ -126,6 +138,8 @@ def test_graph_summary2c():
input_compression=None,
output=output,
report_type="meta-knowledge-graph",
node_facet_properties=["provided_by"],
edge_facet_properties=["aggregator_knowledge_source"],
stream=True,
)

Expand Down
78 changes: 62 additions & 16 deletions tests/unit/test_meta_knowledge_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,24 @@
from tests import RESOURCE_DIR, TARGET_DIR


def _check_mkg_json_contents(data):
    """
    Assert the node and edge statistics expected in a loaded
    meta knowledge graph JSON document.

    Parameters
    ----------
    data
        The parsed JSON document, indexed by "nodes" and "edges".
    """
    nodes = data["nodes"]

    # identifier prefix expected to be recorded for each node category
    for prefix, category in (
        ("NCBIGene", "biolink:Gene"),
        ("REACT", "biolink:Pathway"),
        ("HP", "biolink:PhenotypicFeature"),
    ):
        assert prefix in nodes[category]["id_prefixes"]

    assert nodes["biolink:Gene"]["count"] == 178
    assert len(nodes) == 8

    edges = data["edges"]
    assert len(edges) == 13

    # the first edge entry is checked field by field
    first_edge = edges[0]
    for key, expected in (
        ("subject", "biolink:Gene"),
        ("predicate", "biolink:interacts_with"),
        ("object", "biolink:Gene"),
        ("count", 165),
    ):
        assert first_edge[key] == expected

    # per-source counts are nested under the facet property name
    source_counts = first_edge["count_by_source"]
    assert "aggregator_knowledge_source" in source_counts
    assert source_counts["aggregator_knowledge_source"]["string"] == 159


def test_generate_classical_meta_knowledge_graph():
"""
Test generate meta knowledge graph operation.
Expand All @@ -32,17 +50,15 @@ def test_generate_classical_meta_knowledge_graph():
output_filename = os.path.join(TARGET_DIR, "test_meta_knowledge_graph-1.json")

generate_meta_knowledge_graph(
transformer.store.graph, "Test Graph", output_filename
graph=transformer.store.graph,
name="Test Graph",
filename=output_filename,
edge_facet_properties=["aggregator_knowledge_source"]
)

data = json.load(open(output_filename))
assert data["name"] == "Test Graph"
assert "NCBIGene" in data["nodes"]["biolink:Gene"]["id_prefixes"]
assert "REACT" in data["nodes"]["biolink:Pathway"]["id_prefixes"]
assert "HP" in data["nodes"]["biolink:PhenotypicFeature"]["id_prefixes"]
assert data["nodes"]["biolink:Gene"]["count"] == 178
assert len(data["nodes"]) == 8
assert len(data["edges"]) == 13
_check_mkg_json_contents(data)


def test_generate_meta_knowledge_graph_by_stream_inspector():
Expand All @@ -61,7 +77,8 @@ def test_generate_meta_knowledge_graph_by_stream_inspector():
transformer = Transformer(stream=True)

mkg = MetaKnowledgeGraph(
"Test Graph - Streamed", edge_facet_properties=["aggregator_knowledge_source"]
"Test Graph - Streamed",
edge_facet_properties=["aggregator_knowledge_source"]
)

# We configure the Transformer with a data flow inspector
Expand Down Expand Up @@ -97,6 +114,7 @@ def test_generate_meta_knowledge_graph_by_stream_inspector():
assert len(ecbs1) == 2
assert "biogrid" in ecbs1
assert "string" in ecbs1
assert ecbs1["string"] == 159

ecbs2 = mkg.get_edge_count_by_source(
"biolink:Gene",
Expand All @@ -108,6 +126,7 @@ def test_generate_meta_knowledge_graph_by_stream_inspector():
assert "omim" in ecbs2
assert "orphanet" in ecbs2
assert "hpoa" in ecbs2
assert ecbs2["hpoa"] == 111


#
Expand Down Expand Up @@ -155,7 +174,10 @@ def summary(self):
monitor = ProgressMonitor()

mkg = MetaKnowledgeGraph(
name="Test Graph - Streamed, Stats accessed via File", progress_monitor=monitor
name="Test Graph - Streamed, Stats accessed via File",
progress_monitor=monitor,
node_facet_properties=["provided_by"],
edge_facet_properties=["aggregator_knowledge_source"]
)

t.transform(input_args=input_args, inspector=mkg)
Expand All @@ -166,13 +188,7 @@ def summary(self):

data = json.load(open(output_filename))
assert data["name"] == "Test Graph - Streamed, Stats accessed via File"
assert "NCBIGene" in data["nodes"]["biolink:Gene"]["id_prefixes"]
assert "REACT" in data["nodes"]["biolink:Pathway"]["id_prefixes"]
assert "HP" in data["nodes"]["biolink:PhenotypicFeature"]["id_prefixes"]
assert data["nodes"]["biolink:Gene"]["count"] == 178
assert len(data["nodes"]) == 8
assert len(data["edges"]) == 13

_check_mkg_json_contents(data)
monitor.summary()


Expand Down Expand Up @@ -225,3 +241,33 @@ def test_meta_knowledge_graph_multiple_category_and_predicate_parsing():
assert mkg.get_edge_mapping_count() == 25

assert mkg.get_total_edge_counts_across_mappings() == 100


def test_meta_knowledge_graph_of_complex_graph_data():
    """
    Test meta knowledge graph generation over the "complex graph" fixture,
    whose edge 'relation' column holds multi-valued, pipe-delimited terms,
    some of them duplicated.
    """
    input_args = {
        "filename": [
            os.path.join(RESOURCE_DIR, "complex_graph_nodes.tsv"),
            os.path.join(RESOURCE_DIR, "complex_graph_edges.tsv"),
        ],
        "format": "tsv",
    }

    transformer = Transformer()
    transformer.transform(input_args)

    # Written to its own file so that this test does not overwrite
    # the output of the classical meta knowledge graph test.
    output_filename = os.path.join(
        TARGET_DIR, "test_meta_knowledge_graph-complex.json"
    )

    generate_meta_knowledge_graph(
        graph=transformer.store.graph,
        name="Complex Test Graph",
        filename=output_filename,
        edge_facet_properties=["aggregator_knowledge_source"],
    )

    # Context manager guarantees the file handle is closed after parsing
    with open(output_filename) as mkg_file:
        data = json.load(mkg_file)

    assert data["name"] == "Complex Test Graph"
    print(f"\n{json.dumps(data, indent=4)}")

0 comments on commit 420699a

Please sign in to comment.