Python: Graphrag demo #10064

Draft: wants to merge 4 commits into `main`
Changes from 2 commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -494,6 +494,8 @@ swa-cli.config.json

# python devcontainer
/python/.devcontainer/*
/python/samples/demos/graphrag/output/*
/python/samples/demos/graphrag/ragtest/*

# kiota workspace files
**/.kiota
24 changes: 24 additions & 0 deletions python/samples/demos/graphrag/README.md
@@ -0,0 +1,24 @@
# Graphrag Sample

## Setup
To set up this demo, make sure your working directory is this folder.

Then run the `setup-part-0` script for the appropriate platform.
This installs uv, creates a venv with Python 3.11 in `.venv`, activates the venv, and installs the dependencies.

### Mac/Linux
```bash
./setup-part-0-mac-linux.sh
```
### Windows
```powershell
.\setup-part-0-windows.ps1
```

Next, run `setup-part-1.sh`. This creates the `ragtest` directory, downloads a book into it, and runs the GraphRag init script, which creates a `settings.yaml` and `.env` file:
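
```bash
./setup-part-1.sh
```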

Next, update the `.env` file with your OpenAI API key as the `GRAPHRAG_API_KEY` variable. If you want to use Azure OpenAI, also update `settings.yaml` accordingly; see the GraphRag docs [here](https://github.com/microsoft/graphrag/blob/main/docs/get_started.md) for more info.
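
For example, the `.env` file contains a single line (the key value is a placeholder):

```bash
GRAPHRAG_API_KEY=<your-openai-api-key>
```

For Azure OpenAI, the relevant `settings.yaml` section looks roughly like the sketch below; the exact keys depend on your GraphRag version, so treat this as an illustration and check the docs linked above:

```yaml
llm:
  api_key: ${GRAPHRAG_API_KEY}
  type: azure_openai_chat
  api_base: https://<your-instance>.openai.azure.com
  api_version: 2024-02-15-preview
  deployment_name: <your-deployment-name>
```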

Finally, run the `setup-part-2.sh` script. This runs the indexer, which takes a couple of minutes:
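
```bash
./setup-part-2.sh
```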

Once indexing completes, run `python graphrag_chat.py` to chat with the book. Inside that script are some options, so feel free to change them to your liking.
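
The options, as they appear in the script, include the `print_context` flag in `main()` and the `search_type` passed to the settings in `chat()`:

```python
# In main(): set to True to also print the retrieved context
# (reports, entities, relationships, claims, sources).
print_context = False

# In chat(): pick the GraphRag search mode; "drift" is not supported when streaming.
settings = GraphRagPromptExecutionSettings(search_type="local")  # or "global"
```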
Empty file.
81 changes: 81 additions & 0 deletions python/samples/demos/graphrag/graphrag_chat.py
@@ -0,0 +1,81 @@
# Copyright (c) Microsoft. All rights reserved.

import logging

from graphrag_service import GraphRagChatCompletion, GraphRagPromptExecutionSettings

from semantic_kernel.connectors.ai.chat_completion_client_base import ChatCompletionClientBase
from semantic_kernel.contents import ChatHistory, StreamingChatMessageContent

logger = logging.getLogger(__name__)


async def chat(service: ChatCompletionClientBase, chat_history: ChatHistory) -> StreamingChatMessageContent | None:
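    """Ask for user input and stream the GraphRag response; returns the full message, or None to exit."""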
    try:
        user_input = input("User:> ")
    except (KeyboardInterrupt, EOFError):
        print("\n\nExiting chat...")
        return None

if user_input == "exit":
print("\n\nExiting chat...")
return None

# Add the user message to the chat history so that the chatbot can respond to it.
chat_history.add_user_message(user_input)

# Capture the chunks of the response and print them as they come in.
chunks: list[StreamingChatMessageContent] = []
print("Graphrag:> ", end="")
async for chunk in service.get_streaming_chat_message_content(
chat_history=chat_history,
settings=GraphRagPromptExecutionSettings(search_type="local"),
):
if chunk:
chunks.append(chunk)
print(chunk, end="")
print("")

    # Combine the chunks into a single message to add to the chat history.
    if not chunks:
        return None
    full_message = sum(chunks[1:], chunks[0])
# Add the chat message to the chat history to keep track of the conversation.
chat_history.add_message(full_message)
# Return the full message, including context, to the caller.
return full_message


async def main():
print("Welcome to Graphrag, a chatbot that can answer questions about document(s) indexed using GraphRag.")
print("Type 'exit' to quit.")
# Control whether the full context of the message is printed as well.
print_context = False

graph_rag_chat_completion = GraphRagChatCompletion(project_directory="./ragtest")
if not graph_rag_chat_completion.has_loaded():
await graph_rag_chat_completion.setup()
chat_history = ChatHistory()
while True:
        # chat() returns either the full message (including context) or None.
message = await chat(graph_rag_chat_completion, chat_history)
if message is None:
break
if print_context and "context" in message.metadata:
print("Context:")
for part in ["reports", "entities", "relationships", "claims", "sources"]:
print(f" {part}:")
for values in message.metadata["context"][part]:
if isinstance(values, dict):
for key, value in values.items():
print(f" {key}:{value}")
else:
print(f" {values}")
print("done")


if __name__ == "__main__":
import asyncio

asyncio.run(main())
230 changes: 230 additions & 0 deletions python/samples/demos/graphrag/graphrag_service.py
@@ -0,0 +1,230 @@
# Copyright (c) Microsoft. All rights reserved.

import logging
from collections.abc import AsyncGenerator
from typing import Any, Literal

import pandas as pd
import yaml

import graphrag.api as api
from graphrag.config.create_graphrag_config import GraphRagConfig, create_graphrag_config
from graphrag.index.typing import PipelineRunResult
from semantic_kernel.connectors.ai import PromptExecutionSettings
from semantic_kernel.connectors.ai.chat_completion_client_base import ChatCompletionClientBase
from semantic_kernel.contents import AuthorRole, ChatHistory, ChatMessageContent, StreamingChatMessageContent

logger = logging.getLogger(__name__)


class GraphRagPromptExecutionSettings(PromptExecutionSettings):
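    """Execution settings for GraphRag: the response format and which search mode to run."""
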
response_type: str = "Multiple Paragraphs"
search_type: Literal["local", "global", "drift"] = "global"


class GraphRagChatCompletion(ChatCompletionClientBase):
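    """A Semantic Kernel chat completion service backed by GraphRag searches over a local index."""
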
project_directory: str
graphrag_config: GraphRagConfig
final_nodes: pd.DataFrame | None = None
final_entities: pd.DataFrame | None = None
final_communities: pd.DataFrame | None = None
final_community_reports: pd.DataFrame | None = None
final_documents: pd.DataFrame | None = None
final_relationships: pd.DataFrame | None = None
final_text_units: pd.DataFrame | None = None

def __init__(
self, project_directory: str, service_id: str = "graph_rag", graphrag_config: GraphRagConfig | None = None
):
if not graphrag_config:
with open(f"{project_directory}/settings.yaml") as file:
graphrag_config = create_graphrag_config(values=yaml.safe_load(file), root_dir=project_directory)
super().__init__(
service_id=service_id,
ai_model_id=service_id,
project_directory=project_directory,
graphrag_config=graphrag_config,
)

def get_prompt_execution_settings_class(self) -> type[PromptExecutionSettings]:
return GraphRagPromptExecutionSettings

async def setup(self):
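        """Build the GraphRag index, then load the resulting output files."""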
index_result: list[PipelineRunResult] = await api.build_index(config=self.graphrag_config)

# index_result is a list of workflows that make up the indexing pipeline that was run
for workflow_result in index_result:
status = f"error\n{workflow_result.errors}" if workflow_result.errors else "success"
print(f"Workflow Name: {workflow_result.workflow}\tStatus: {status}")
self.load()

    def has_loaded(self, search_type: Literal["local", "global", "drift"] | None = None) -> bool:
        # Global search needs the smallest set of assets; local and drift searches
        # additionally require text units and relationships.
        if search_type == "global":
            return all([
                self.final_nodes is not None,
                self.final_entities is not None,
                self.final_communities is not None,
                self.final_community_reports is not None,
            ])
        if search_type in ("local", "drift"):
            return all([
                self.final_nodes is not None,
                self.final_entities is not None,
                self.final_communities is not None,
                self.final_community_reports is not None,
                self.final_text_units is not None,
                self.final_relationships is not None,
            ])
return all([
self.final_nodes is not None,
self.final_entities is not None,
self.final_communities is not None,
self.final_community_reports is not None,
self.final_text_units is not None,
self.final_relationships is not None,
self.final_documents is not None,
])

def post_model_init(self, *args, **kwargs):
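        """Try to load previously built output files once the model is initialized."""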
try:
self.load()
except FileNotFoundError:
            logger.warning(
                "Could not load the GraphRag output files; please run setup first to build the index."
            )

def load(self):
"""Load the final nodes, entities, communities, and community reports."""
self.final_nodes = pd.read_parquet(f"{self.project_directory}/output/create_final_nodes.parquet")
self.final_entities = pd.read_parquet(f"{self.project_directory}/output/create_final_entities.parquet")
self.final_communities = pd.read_parquet(f"{self.project_directory}/output/create_final_communities.parquet")
self.final_community_reports = pd.read_parquet(
f"{self.project_directory}/output/create_final_community_reports.parquet"
)
self.final_text_units = pd.read_parquet(f"{self.project_directory}/output/create_final_text_units.parquet")
self.final_relationships = pd.read_parquet(
f"{self.project_directory}/output/create_final_relationships.parquet"
)
self.final_documents = pd.read_parquet(f"{self.project_directory}/output/create_final_documents.parquet")

async def _inner_get_chat_completion_contents(
self,
chat_history: "ChatHistory",
settings: "PromptExecutionSettings",
) -> list["ChatMessageContent"]:
if not isinstance(settings, GraphRagPromptExecutionSettings):
settings = self.get_prompt_execution_settings_from_settings(settings)
if not self.has_loaded(search_type=settings.search_type):
raise ValueError("The required assets have not been loaded, please run setup first.")
if settings.search_type == "global":
response, context = await api.global_search(
config=self.graphrag_config,
nodes=self.final_nodes,
entities=self.final_entities,
communities=self.final_communities,
community_reports=self.final_community_reports,
community_level=2,
dynamic_community_selection=False,
response_type=settings.response_type,
query=chat_history.messages[-1].content,
)
if isinstance(response, str):
cmc = ChatMessageContent(role=AuthorRole.ASSISTANT, content=response, metadata={"context": context})
return [cmc]
raise ValueError("Unknown response type.")
if settings.search_type == "local":
response, context = await api.local_search(
config=self.graphrag_config,
nodes=self.final_nodes,
entities=self.final_entities,
community_reports=self.final_community_reports,
text_units=self.final_text_units,
relationships=self.final_relationships,
covariates=None,
community_level=2,
response_type=settings.response_type,
query=chat_history.messages[-1].content,
)
if isinstance(response, str):
cmc = ChatMessageContent(role=AuthorRole.ASSISTANT, content=response, metadata={"context": context})
return [cmc]
raise ValueError("Unknown response type.")
response, context = await api.drift_search(
config=self.graphrag_config,
nodes=self.final_nodes,
entities=self.final_entities,
community_reports=self.final_community_reports,
text_units=self.final_text_units,
relationships=self.final_relationships,
community_level=2,
query=chat_history.messages[-1].content,
)
if isinstance(response, str):
cmc = ChatMessageContent(role=AuthorRole.ASSISTANT, content=response, metadata={"context": context})
return [cmc]
raise ValueError("Unknown response type.")

async def _inner_get_streaming_chat_message_contents(
self,
chat_history: "ChatHistory",
settings: "PromptExecutionSettings",
) -> AsyncGenerator[list["StreamingChatMessageContent"], Any]:
if not isinstance(settings, GraphRagPromptExecutionSettings):
settings = self.get_prompt_execution_settings_from_settings(settings)
if not self.has_loaded(search_type=settings.search_type):
raise ValueError("The required assets have not been loaded, please run setup first.")
if settings.search_type == "global":
responses = api.global_search_streaming(
config=self.graphrag_config,
nodes=self.final_nodes,
entities=self.final_entities,
communities=self.final_communities,
community_reports=self.final_community_reports,
community_level=2,
dynamic_community_selection=False,
response_type=settings.response_type,
query=chat_history.messages[-1].content,
)
async for response in responses:
if isinstance(response, str):
cmc = StreamingChatMessageContent(choice_index=0, role=AuthorRole.ASSISTANT, content=response)
yield [cmc]
if isinstance(response, dict):
cmc = StreamingChatMessageContent(
choice_index=0, content="", role=AuthorRole.ASSISTANT, metadata={"context": response}
)
yield [cmc]
return
elif settings.search_type == "local":
responses = api.local_search_streaming(
config=self.graphrag_config,
nodes=self.final_nodes,
entities=self.final_entities,
community_reports=self.final_community_reports,
text_units=self.final_text_units,
relationships=self.final_relationships,
covariates=None,
community_level=2,
response_type=settings.response_type,
query=chat_history.messages[-1].content,
)
async for response in responses:
if isinstance(response, str):
cmc = StreamingChatMessageContent(choice_index=0, role=AuthorRole.ASSISTANT, content=response)
yield [cmc]
if isinstance(response, dict):
cmc = StreamingChatMessageContent(
choice_index=0, content="", role=AuthorRole.ASSISTANT, metadata={"context": response}
)
yield [cmc]
return
raise NotImplementedError("Drift search is not available when streaming.")
3 changes: 3 additions & 0 deletions python/samples/demos/graphrag/requirements.txt
@@ -0,0 +1,3 @@
graphrag
semantic-kernel[azure]
pandas
7 changes: 7 additions & 0 deletions python/samples/demos/graphrag/setup-part-0-mac-linux.sh
@@ -0,0 +1,7 @@
curl -LsSf https://astral.sh/uv/install.sh | sh

uv venv --python 3.11

source .venv/bin/activate

uv pip install -r requirements.txt
8 changes: 8 additions & 0 deletions python/samples/demos/graphrag/setup-part-0-windows.ps1
@@ -0,0 +1,8 @@
powershell -ExecutionPolicy ByPass -c "irm https://astral.sh/uv/install.ps1 | iex"

uv venv --python 3.11

.venv\Scripts\Activate.ps1

uv pip install -r requirements.txt

8 changes: 8 additions & 0 deletions python/samples/demos/graphrag/setup-part-1.sh
@@ -0,0 +1,8 @@
# Create the folder containing your GraphRag index, including its input folder
mkdir -p ./ragtest/input

# Download a book to use as your input
curl https://www.gutenberg.org/cache/epub/24022/pg24022.txt -o ./ragtest/input/book.txt

# Initialize the GraphRag setup
graphrag init --root ./ragtest
3 changes: 3 additions & 0 deletions python/samples/demos/graphrag/setup-part-2.sh
@@ -0,0 +1,3 @@
# Make sure you have added your OpenAI key to the .env file
# and made any other changes you want to the config file
graphrag index --root ./ragtest