diff --git a/kgx/graph_operations/meta_knowledge_graph.py b/kgx/graph_operations/meta_knowledge_graph.py index 48dae837..5fb04e11 100644 --- a/kgx/graph_operations/meta_knowledge_graph.py +++ b/kgx/graph_operations/meta_knowledge_graph.py @@ -73,13 +73,13 @@ class MetaKnowledgeGraph: error_log = stderr def __init__( - self, - name="", - node_facet_properties: Optional[List] = None, - edge_facet_properties: Optional[List] = None, - progress_monitor: Optional[Callable[[GraphEntityType, List], None]] = None, - error_log=None, - **kwargs, + self, + name="", + node_facet_properties: Optional[List] = None, + edge_facet_properties: Optional[List] = None, + progress_monitor: Optional[Callable[[GraphEntityType, List], None]] = None, + error_log=None, + **kwargs, ): """ MetaKnowledgeGraph constructor. @@ -213,8 +213,8 @@ def __init__(self, category_curie: str, mkg): Biolink Model category CURIE identifier. """ if not ( - _category_curie_regexp.fullmatch(category_curie) - or category_curie == "unknown" + _category_curie_regexp.fullmatch(category_curie) + or category_curie == "unknown" ): raise RuntimeError("Invalid Biolink category CURIE: " + category_curie) @@ -280,7 +280,7 @@ def get_count(self) -> int: return self.category_stats["count"] def get_count_by_source( - self, facet: str = "provided_by", source: str = None + self, facet: str = "provided_by", source: str = None ) -> Dict[str, Any]: """ Parameters @@ -469,8 +469,24 @@ def _compile_triple_source_stats(self, triple: Tuple[str, str, str], data: Dict) data, ) + @staticmethod + def _normalize_relation_field(field) -> Set: + # various non-string iterables... + if isinstance(field, List) or \ + isinstance(field, Tuple) or \ + isinstance(field, Set): + # eliminate duplicate terms + # and normalize to a set + return set(field) + elif isinstance(field, str): + # for uniformity, we coerce + # to a set of one element + return {field} + else: + raise TypeError(f"Unexpected KGX edge 'relation' data field of type '{type(field)}'") + def _process_triple( - self, subject_category: str, predicate: str, object_category: str, data: Dict + self, subject_category: str, predicate: str, object_category: str, data: Dict ): # Process the 'valid' S-P-O triple here... triple = (subject_category, predicate, object_category) @@ -484,11 +500,13 @@ def _process_triple( "count": 0, } - if ( - "relation" in data - and data["relation"] not in self.association_map[triple]["relations"] - ): - self.association_map[triple]["relations"].add(data["relation"]) + # patch for observed defect in some ETL's such as the July 2021 SRI Reference graph + # in which the relation field ends up being a list of terms, sometimes duplicated + + if "relation" in data: + # input data["relation"] is normalized to a Set here + data["relation"] = self._normalize_relation_field(data["relation"]) + self.association_map[triple]["relations"].update(data["relation"]) self.association_map[triple]["count"] += 1 @@ -545,7 +563,6 @@ def analyse_edge(self, u, v, k, data) -> None: return for obj_cat_idx in self.node_catalog[v]: - object_category: str = self.Category.get_category_curie_from_index( obj_cat_idx ) @@ -733,12 +750,12 @@ def get_total_edge_counts_across_mappings(self) -> int: return count def get_edge_count_by_source( - self, - subject_category: str, - predicate: str, - object_category: str, - facet: str = "knowledge_source", - source: Optional[str] = None, + self, + subject_category: str, + predicate: str, + object_category: str, + facet: str = "knowledge_source", + source: Optional[str] = None, ) -> Dict[str, Any]: """ Returns count by source for one S-P-O triple (S, O being Biolink categories; P, a Biolink predicate) @@ -751,8 +768,8 @@ def get_edge_count_by_source( return dict() triple = (subject_category, predicate, object_category) if ( - triple in self.association_map - and "count_by_source" in self.association_map[triple] + triple in self.association_map + and "count_by_source" in self.association_map[triple] ): if facet in self.association_map[triple]["count_by_source"]: if source: @@ -902,10 +919,10 @@ def save(self, file, name: str = None, file_format: str = "json") -> None: yaml.dump(stats, file) -def generate_meta_knowledge_graph(graph: BaseGraph, name: str, filename: str) -> None: +def generate_meta_knowledge_graph(graph: BaseGraph, name: str, filename: str, **kwargs) -> None: """ - Generate a knowledge map that describes the composition of the graph - and write to ``filename``. + Generate a knowledge map that describes + the composition of the graph and write to ``filename``. Parameters ---------- @@ -917,7 +934,7 @@ def generate_meta_knowledge_graph(graph: BaseGraph, name: str, filename: str) -> The file to write the knowledge map to """ - graph_stats = summarize_graph(graph, name) + graph_stats = summarize_graph(graph, name, **kwargs) with open(filename, mode="w") as mkgh: dump(graph_stats, mkgh, indent=4, default=mkg_default) @@ -940,5 +957,5 @@ def summarize_graph(graph: BaseGraph, name: str = None, **kwargs) -> Dict: Dict A TRAPI 1.1 compliant meta knowledge graph of the knowledge graph returned as a dictionary. """ - mkg = MetaKnowledgeGraph(name) + mkg = MetaKnowledgeGraph(name, **kwargs) return mkg.summarize_graph(graph) diff --git a/tests/resources/complex_graph_edges.tsv b/tests/resources/complex_graph_edges.tsv new file mode 100644 index 00000000..b41ad96d --- /dev/null +++ b/tests/resources/complex_graph_edges.tsv @@ -0,0 +1,8 @@ +subject object predicate relation aggregator_knowledge_source +HGNC:10848 NCBIGene:6469 biolink:interacts_with RO:0002434 biogrid +HGNC:10848 HGNC:9398 biolink:interacts_with RO:0002434|RO:0002434|RO:0002434 string +HGNC:10848 HGNC:9399 biolink:interacts_with RO:0002434 string +HGNC:10848 HGNC:16265 biolink:interacts_with RO:0002434 string +HGNC:10848 HGNC:16787 biolink:interacts_with RO:0002434 biogrid +HGNC:10848 GO:0009986 biolink:part_of BFO:0000050 go +HGNC:10848 GO:0097190 biolink:related_to RO:0002331|RO:0002327 go \ No newline at end of file diff --git a/tests/resources/complex_graph_nodes.tsv b/tests/resources/complex_graph_nodes.tsv new file mode 100644 index 00000000..c4721a51 --- /dev/null +++ b/tests/resources/complex_graph_nodes.tsv @@ -0,0 +1,10 @@ +id name category taxon +HGNC:10848 SHH (human) biolink:Gene NCBITaxon:9606 +NCBIGene:6469 SHH biolink:Gene NCBITaxon:9606 +HGNC:9398 OLIG2 biolink:Gene NCBITaxon:9606 +HGNC:9399 PRKCD biolink:Gene NCBITaxon:9606 +HGNC:16265 WNT5B biolink:Gene NCBITaxon:9606 +HGNC:16466 SUFU biolink:Gene NCBITaxon:9606 +HGNC:16787 EDEM3 biolink:Gene NCBITaxon:9606 +GO:0009986 cell surface biolink:CellularComponent +GO:0097190 apoptotic signaling pathway biolink:BiologicalProcess diff --git a/tests/unit/test_cli_utils.py b/tests/unit/test_cli_utils.py index c6e24ad9..13b5c21c 100644 --- a/tests/unit/test_cli_utils.py +++ b/tests/unit/test_cli_utils.py @@ -44,7 +44,15 @@ def test_graph_summary1(): os.path.join(RESOURCE_DIR, "graph_edges.tsv"), ] output = os.path.join(TARGET_DIR, "graph_stats1.yaml") - summary_stats = graph_summary(inputs, "tsv", None, output, report_type="kgx-map") + summary_stats = graph_summary( + inputs, + "tsv", + None, + output, + node_facet_properties=["provided_by"], + edge_facet_properties=["aggregator_knowledge_source"], + report_type="kgx-map" + ) assert os.path.exists(output) assert summary_stats @@ -74,6 +82,8 @@ def test_graph_summary2a(): None, output, report_type="meta-knowledge-graph", + node_facet_properties=["provided_by"], + edge_facet_properties=["aggregator_knowledge_source"], graph_name="Default Meta-Knowledge-Graph", ) @@ -101,6 +111,8 @@ def test_graph_summary2b(): None, output, report_type="meta-knowledge-graph", + node_facet_properties=["provided_by"], + edge_facet_properties=["aggregator_knowledge_source"], report_format="yaml", ) @@ -126,6 +138,8 @@ def test_graph_summary2c(): input_compression=None, output=output, report_type="meta-knowledge-graph", + node_facet_properties=["provided_by"], + edge_facet_properties=["aggregator_knowledge_source"], stream=True, ) diff --git a/tests/unit/test_meta_knowledge_graph.py b/tests/unit/test_meta_knowledge_graph.py index 7806f9c9..138f0a89 100644 --- a/tests/unit/test_meta_knowledge_graph.py +++ b/tests/unit/test_meta_knowledge_graph.py @@ -13,6 +13,24 @@ from tests import RESOURCE_DIR, TARGET_DIR +def _check_mkg_json_contents(data): + assert "NCBIGene" in data["nodes"]["biolink:Gene"]["id_prefixes"] + assert "REACT" in data["nodes"]["biolink:Pathway"]["id_prefixes"] + assert "HP" in data["nodes"]["biolink:PhenotypicFeature"]["id_prefixes"] + assert data["nodes"]["biolink:Gene"]["count"] == 178 + assert len(data["nodes"]) == 8 + assert len(data["edges"]) == 13 + edge1 = data["edges"][0] + assert edge1["subject"] == "biolink:Gene" + assert edge1["predicate"] == "biolink:interacts_with" + assert edge1["object"] == "biolink:Gene" + assert edge1["count"] == 165 + edge1_cbs = edge1["count_by_source"] + assert "aggregator_knowledge_source" in edge1_cbs + edge1_cbs_aks = edge1_cbs["aggregator_knowledge_source"] + assert edge1_cbs_aks["string"] == 159 + + def test_generate_classical_meta_knowledge_graph(): """ Test generate meta knowledge graph operation. @@ -32,17 +50,15 @@ def test_generate_classical_meta_knowledge_graph(): output_filename = os.path.join(TARGET_DIR, "test_meta_knowledge_graph-1.json") generate_meta_knowledge_graph( - transformer.store.graph, "Test Graph", output_filename + graph=transformer.store.graph, + name="Test Graph", + filename=output_filename, + edge_facet_properties=["aggregator_knowledge_source"] ) data = json.load(open(output_filename)) assert data["name"] == "Test Graph" - assert "NCBIGene" in data["nodes"]["biolink:Gene"]["id_prefixes"] - assert "REACT" in data["nodes"]["biolink:Pathway"]["id_prefixes"] - assert "HP" in data["nodes"]["biolink:PhenotypicFeature"]["id_prefixes"] - assert data["nodes"]["biolink:Gene"]["count"] == 178 - assert len(data["nodes"]) == 8 - assert len(data["edges"]) == 13 + _check_mkg_json_contents(data) def test_generate_meta_knowledge_graph_by_stream_inspector(): @@ -61,7 +77,8 @@ def test_generate_meta_knowledge_graph_by_stream_inspector(): transformer = Transformer(stream=True) mkg = MetaKnowledgeGraph( - "Test Graph - Streamed", edge_facet_properties=["aggregator_knowledge_source"] + "Test Graph - Streamed", + edge_facet_properties=["aggregator_knowledge_source"] ) # We configure the Transformer with a data flow inspector @@ -97,6 +114,7 @@ def test_generate_meta_knowledge_graph_by_stream_inspector(): assert len(ecbs1) == 2 assert "biogrid" in ecbs1 assert "string" in ecbs1 + assert ecbs1["string"] == 159 ecbs2 = mkg.get_edge_count_by_source( "biolink:Gene", @@ -108,6 +126,7 @@ def test_generate_meta_knowledge_graph_by_stream_inspector(): assert "omim" in ecbs2 assert "orphanet" in ecbs2 assert "hpoa" in ecbs2 + assert ecbs2["hpoa"] == 111 # @@ -155,7 +174,10 @@ def summary(self): monitor = ProgressMonitor() mkg = MetaKnowledgeGraph( - name="Test Graph - Streamed, Stats accessed via File", progress_monitor=monitor + name="Test Graph - Streamed, Stats accessed via File", + progress_monitor=monitor, + node_facet_properties=["provided_by"], + edge_facet_properties=["aggregator_knowledge_source"] ) t.transform(input_args=input_args, inspector=mkg) @@ -166,13 +188,7 @@ def summary(self): data = json.load(open(output_filename)) assert data["name"] == "Test Graph - Streamed, Stats accessed via File" - assert "NCBIGene" in data["nodes"]["biolink:Gene"]["id_prefixes"] - assert "REACT" in data["nodes"]["biolink:Pathway"]["id_prefixes"] - assert "HP" in data["nodes"]["biolink:PhenotypicFeature"]["id_prefixes"] - assert data["nodes"]["biolink:Gene"]["count"] == 178 - assert len(data["nodes"]) == 8 - assert len(data["edges"]) == 13 - + _check_mkg_json_contents(data) monitor.summary() @@ -225,3 +241,33 @@ def test_meta_knowledge_graph_multiple_category_and_predicate_parsing(): assert mkg.get_edge_mapping_count() == 25 assert mkg.get_total_edge_counts_across_mappings() == 100 + + +def test_meta_knowledge_graph_of_complex_graph_data(): + """ + Test generate meta knowledge graph operation. + """ + input_args = { + "filename": [ + os.path.join(RESOURCE_DIR, "complex_graph_nodes.tsv"), + os.path.join(RESOURCE_DIR, "complex_graph_edges.tsv"), + ], + "format": "tsv", + } + + transformer = Transformer() + + transformer.transform(input_args) + + output_filename = os.path.join(TARGET_DIR, "test_meta_knowledge_graph-1.json") + + generate_meta_knowledge_graph( + graph=transformer.store.graph, + name="Complex Test Graph", + filename=output_filename, + edge_facet_properties=["aggregator_knowledge_source"] + ) + + data = json.load(open(output_filename)) + assert data["name"] == "Complex Test Graph" + print(f"\n{json.dumps(data, indent=4)}")