Commit 9d44519
CDSTP-110-Investigate-potential-solutions-for-storing-tree-structures (#98)
* CDSTP-110 Create docs file to store results of investigation
* CDSTP-110 Update filename
* CDSTP-110 Update file
* CDSTP-110 Add basic algorithm functionality
* CDSTP-110 Add script to generate dummy json OTel data
* CDSTP-110 Add function to convert json data into memgraph required format
* CDSTP-110 Add id to relationship node
* CDSTP-110 Add functionality to find unique graphs within memgraph database
* CDSTP-110 Tidy up typing
* CDSTP-110 Update algorithm to use custom procedure within memgraph
* CDSTP-110 Add docstrings to functions and module
* CDSTP-110 Fix mypy errors
* CDSTP-110 Fix formatting
* CDSTP-110 Add SQLite database to compare efficiency
* CDSTP-110 Update SQLite method for finding unique graphs
* CDSTP-110 Update findings and use in memory storage for SQLite computations
* CDSTP-110 Add batched requests to sqlite method
* CDSTP-110 Update investigation findings
* CDSTP-110 Remove notes from design note
* CDSTP-110 Fix somy mypy errors
* CDSTP-110 Fix linting errors
* CDSTP-110 Add docstrings
* CDSTP-110 Create a POC folder
* CDSTP-110 Update mypy script to exclude graph_solutions_poc folder
* CDSTP-110 Fix linting
* CDSTP-110 Update scripts file to .md
* CDSTP-110 Add system architecture diagram
* CDSTP-110 Adding required functions and classes to design note
* CDSTP-110 Fix linting error
* CDSTP-110 Fix linting and add extra functions to design note
* CDSTP-110 Fix import error
* CDSTP-Remove DN3 - to be moved into new branch
* CDSTP-110 Create generic POC folder
* CDSTP-110 Update mypy script to ignore poc folder
1 parent 56c95f8 commit 9d44519

File tree

13 files changed: +879 -6 lines changed

.gitignore (+1, -3)

```diff
@@ -167,6 +167,4 @@ outputs
 data/
 
 # vscode
-.vscode
-
-/tel2puml/OtelSpan_output
+.vscode
```

.mypy.ini (+1, -1)

```diff
@@ -1,3 +1,3 @@
 [mypy]
 ignore_errors = True
-exclude = tests
+exclude = tests
```
New file (+197 lines):
# Investigation into Storage and Evaluation Methods to Find Unique Graphs within OpenTelemetry Data

## 1. Current Implementation

### 1.1 Data Storage

- OpenTelemetry (OTel) data is parsed and stored in a SQLite database.
- Relations between data points are formed within the database structure.

### 1.2 Graph Evaluation Method

- Unique graph structures are identified through a comparison process.
- The current method sorts graph data into alphanumerically ordered lists before comparison.
- This approach is not fully accurate due to limitations of the sorting-based comparison.

### 1.3 Limitations

- Potential for false positives or negatives when determining graph isomorphism (graph uniqueness).

## 2. Investigation Objectives

The primary goal is to explore alternative methods for storing and evaluating graph data derived from OTel traces. Specifically, we aim to:

1. Identify more accurate methods for determining graph isomorphism.
2. Explore storage solutions optimised for graph data.
3. Evaluate the performance and scalability of different approaches.
4. Assess the ease of implementation and maintenance of new solutions.
## 3. Proposed Solutions for Investigation

### 3.1 Graph Databases

#### 3.1.1 Neo4j

- Native graph storage and querying capabilities.
- Cypher query language for complex graph operations.
- Built-in visualisation tools.
- Disk-based.
- Written in Java.
- The most popular graph database solution.

#### 3.1.2 ArangoDB

- Supports JSON documents, graphs and key/values.
- Uses AQL (ArangoDB Query Language).
- Disk-based.
- Written in C++.

#### 3.1.3 Memgraph

- Open-source graph database built for streaming, compatible with Neo4j.
- Supports both property graph and RDF models.
- Written in C++.
- In-memory based (disk-based mode available) for faster querying.
- Uses Cypher as its query language.
- Compatible for use with NetworkX.
- Allows custom procedures to be written in Python.
- NetworkX has built-in methods for detecting graph isomorphism.
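For illustration, NetworkX's built-in isomorphism check (which uses the VF2 algorithm) can be called directly on small directed graphs. A minimal sketch with toy graphs:

```python
import networkx as nx

# Two DiGraphs with the same shape but different node ids
g1 = nx.DiGraph([("a", "b"), ("a", "c")])
g2 = nx.DiGraph([("x", "y"), ("x", "z")])

# VF2-based isomorphism check built into NetworkX
print(nx.is_isomorphic(g1, g2))  # True
```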
#### 3.1.4 SQLite

- Lightweight, serverless, self-contained relational database engine.
- Written in C.
- File-based: the entire database is stored in a single file on disk.
- ACID-compliant.
- Supports standard SQL syntax with some extensions.
- No native graph capabilities, but can be used to store graph-like structures.
- Requires custom implementation for graph operations and traversals.
- Efficient for smaller datasets and embedded applications.
- Limited concurrency support.
- No built-in support for graph algorithms or isomorphism detection.
### 3.2 Advanced Graph Algorithms

#### 3.2.1 Graph Isomorphism Algorithms

- VF2 algorithm for graph and subgraph isomorphism detection.
- Ullmann's algorithm for subgraph isomorphism.

#### 3.2.2 Graph Hashing Techniques

- Weisfeiler-Lehman (WL) graph kernels for structure comparison.
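A sketch of the WL hashing idea using NetworkX's `weisfeiler_lehman_graph_hash` (toy graphs for illustration): isomorphic graphs always produce identical hashes, so hash equality can be used as a fast uniqueness test.

```python
import networkx as nx

# Isomorphic DiGraphs (same structure, relabelled nodes)
g1 = nx.DiGraph([("root", "a"), ("root", "b"), ("a", "c")])
g2 = nx.DiGraph([("r", "x"), ("r", "y"), ("x", "z")])

h1 = nx.weisfeiler_lehman_graph_hash(g1)
h2 = nx.weisfeiler_lehman_graph_hash(g2)
print(h1 == h2)  # True: isomorphic graphs produce identical WL hashes
```

Note that WL hashing is a heuristic: non-isomorphic graphs can in principle collide, though in practice collisions are rare for tree-like trace structures.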
## 4. Evaluation Criteria

For each proposed solution, we will evaluate:

1. Accuracy in identifying unique graph structures.
2. Query performance for common graph operations.
3. Scalability with increasing data volume.
4. Ease of integration with the existing OTel data processing pipeline.
5. Maintenance overhead and long-term viability.
### 4.1 Custom Solution with Memgraph and NetworkX

#### 4.1.1 Memgraph with NetworkX Integration

- Utilise Memgraph's ability to run custom procedures written in Python.
- Implement a procedure to convert Memgraph graphs to NetworkX DiGraph objects.
- Leverage NetworkX's implementation of the Weisfeiler-Lehman graph hashing algorithm.
- Use the generated hash for graph isomorphism comparisons.

#### 4.1.2 Findings

- Memgraph's lack of native graph isomorphism capabilities necessitated a custom solution.
- To load JSON data into Memgraph, the data had to be transformed into a specific format containing data for a "node" and for a "relationship", which created additional overhead.
- Integration with NetworkX provides access to a wide range of graph algorithms.
- The Weisfeiler-Lehman algorithm implemented in NetworkX offers an efficient and precise method for graph hashing.
- This approach allows flexibility in implementing custom graph analysis procedures.
- Initial tests show promising results for identifying unique graph structures in OTel data.
- Performance is relatively slow: 10,000 graphs of depth 3 with 1-3 branches per node took around 3-6 minutes to process, even with optimisations such as batched queries.
#### 4.1.3 Algorithm Overview

1. Convert OTel data to JSON consisting of nodes and relationships. The data can then be loaded into Memgraph using the procedure `import_util.json()`.
2. Query the database for root nodes.
3. Extract `job_name` and `trace_id` from root nodes, mapping job names to trace ids.
4. Loop over the trace ids within each job name, querying the database with a batch of trace ids.
5. Iterate over the trace ids within the Cypher query using the `UNWIND` statement. Call the custom procedure within the query that converts a Memgraph graph to a NetworkX DiGraph and returns its Weisfeiler-Lehman hash value.
6. Store hashes within a `defaultdict(set)`, mapping job names to unique graph hashes.
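The conversion-and-hash step (step 5) can be sketched outside Memgraph as follows. The node/relationship dict shapes mirror the import format described above (and the TypedDicts included in this commit); the helper name `to_digraph` is hypothetical:

```python
import networkx as nx

# Hypothetical sample of the node/relationship import format
nodes = [
    {"id": "1", "labels": ["Span"], "properties": {"event_type": "start"}, "type": "node"},
    {"id": "2", "labels": ["Span"], "properties": {"event_type": "A"}, "type": "node"},
]
relationships = [
    {"id": "r1", "start": "1", "end": "2", "label": "CHILD", "properties": {}, "type": "relationship"},
]


def to_digraph(nodes: list[dict], relationships: list[dict]) -> nx.DiGraph:
    """Convert node/relationship dicts into a NetworkX DiGraph."""
    graph = nx.DiGraph()
    for node in nodes:
        # Carry the event_type across as a node label for hashing
        graph.add_node(node["id"], label=node["properties"].get("event_type", ""))
    for rel in relationships:
        graph.add_edge(rel["start"], rel["end"])
    return graph


graph = to_digraph(nodes, relationships)
# Hash on the event_type label so structure AND event types are compared
print(nx.weisfeiler_lehman_graph_hash(graph, node_attr="label"))
```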
### 4.2 Custom Solution with SQLite

#### 4.2.1 Performance Findings

- Initial implementation: the basic SQLite implementation demonstrated a significant performance advantage, computing graphs at least 5 times faster than the Memgraph solution.
- Custom hashing algorithm: we developed a tailored hashing solution that computes a node's hash by combining:
  * the hash of the node's `event_type`;
  * the sorted hashes of its children's `event_type`s.
  * This approach proved both efficient and accurate, matching the Memgraph/NetworkX solution in identifying unique graphs when tested on a dataset of 10,000 graphs.
- In-memory processing: loading the dataset into memory achieved a 40% speed improvement for processing 10,000 graphs of depth 3 with 1-3 branches per node.
- Query optimisation: implementing batch processing with a batch size of 500 nodes gave remarkable results:
  * SQLite solution: 1.6 seconds
  * Memgraph solution: 200 seconds
  * This represents a 125x speed improvement over the Memgraph approach.
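The hashing scheme described above can be sketched as follows (a minimal illustration; `hash_node` is a hypothetical helper name):

```python
import hashlib


def hash_node(event_type: str, child_hashes: list[str]) -> str:
    """Hash a node by combining its event_type with the sorted
    hashes of its children, so sibling order does not matter."""
    combined = event_type + "".join(sorted(child_hashes))
    return hashlib.sha256(combined.encode()).hexdigest()


# Leaf nodes hash only their event_type
leaf_b = hash_node("B", [])
leaf_c = hash_node("C", [])

# The parent hash is order-independent thanks to sorting
assert hash_node("A", [leaf_b, leaf_c]) == hash_node("A", [leaf_c, leaf_b])
```

Sorting the child hashes is what makes the scheme insensitive to the order in which children are stored, while still distinguishing different tree shapes and event types.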
#### 4.2.2 Architectural Advantages

- Flexible data modelling: the SQLite solution allows easy modifications to the Node model, accommodating changes in data structure without significant refactoring.
- Reduced data manipulation: unlike the Memgraph solution, which required post-processing of data, the SQLite approach eliminates the need for data transformation, resulting in less overhead and a simpler data pipeline.
- Scalability: the performance gains observed with the SQLite solution suggest better scalability for larger datasets, addressing one of the key objectives of this investigation.

#### 4.2.3 Comparative Analysis

When compared to the Memgraph solution, the SQLite approach offers:

- Substantially faster processing times (125x improvement in our tests)
- A simpler data pipeline with reduced manipulation requirements
- Greater flexibility in data modelling
- Potential for better scalability with larger datasets
#### 4.2.4 Algorithm Overview

Database Setup:

- Create an SQLite database (in-memory for this implementation).
- Define a Node model representing the structure of OTel data.

Data Loading:

- Load node data from a JSON file into the SQLite database.
- Each node contains `span_id`, `trace_id`, `event_type`, `job_name`, and `prev_span_id`.

Graph Processing:

- Retrieve distinct job names from the database.
- For each job name:
  * Query for root nodes (nodes with no `prev_span_id`) for that job.
  * Process root nodes in batches (batch size = 500).

Graph Hashing:

- For each batch of root nodes:
  * Retrieve all related nodes for the batch from the database.
  * Create a mapping of nodes to their children.
  * For each root node in the batch:
    - Recursively compute a hash for the graph starting from the root node.
    - The hash is based on the node's `event_type` and its children's hashes.

Unique Graph Identification:

- Maintain a hash set for each job name.
- Add computed graph hashes to the corresponding job name's hash set.
- Keep track of which `trace_id`s correspond to each unique graph hash.

Result Compilation:

- Count the total number of unique graph structures across all job names.

Performance Optimisation:

- Use an in-memory SQLite database for faster access.
- Process nodes in batches to reduce database query overhead.
- Use an efficient hashing algorithm (SHA-256) for graph structure comparison.
## 5. Conclusion

This investigation into storage and evaluation methods for OpenTelemetry data has yielded valuable insights, particularly in comparing SQLite and Memgraph solutions for identifying unique graph structures.

The custom SQLite solution demonstrated significant performance advantages over the Memgraph approach:

- Speed: with a basic implementation, SQLite computed graphs 5 times faster than the Memgraph solution. With optimisations such as in-memory processing and query batching, the SQLite approach processed 10,000 graphs in just 1.6 seconds, compared to Memgraph's 200 seconds.
- Efficiency: the custom hashing algorithm implemented for SQLite proved both fast and accurate, matching the Memgraph/NetworkX solution in identifying unique graphs.
- Flexibility: SQLite allowed easier modifications to the Node model and required less data manipulation, resulting in reduced overhead compared to Memgraph.
- Scalability: the SQLite solution showed better performance with larger datasets, addressing one of our key investigation objectives.

While Memgraph offered some advantages, such as native graph storage and compatibility with NetworkX for advanced algorithms, these benefits were outweighed by the performance gains and simplicity of the SQLite approach for our specific use case.

The investigation also highlighted the importance of custom implementations tailored to specific needs. The SQLite solution, despite lacking native graph capabilities, outperformed the specialised graph database when optimised for our particular requirements.

Moving forward, we recommend:

- Further optimisation and refinement of the SQLite-based solution.
- Conducting additional scalability tests with even larger datasets.
- Exploring ways to incorporate some of the beneficial features of graph databases (such as visualisation) into the SQLite-based system.

In conclusion, this investigation has provided a clear direction for improving our OTel trace analysis infrastructure, favouring a highly optimised SQLite-based approach over more complex graph database solutions for our current needs.
New file (+33 lines):

```python
"""TypedDicts for graph solutions"""

from typing import TypedDict, NotRequired


class NodeData(TypedDict):
    """TypedDict for NodeData"""

    id: str
    labels: list[str]
    properties: dict[str, str]
    type: str


class NodeRelationshipData(TypedDict):
    """TypedDict for NodeRelationshipData"""

    id: str
    end: str
    start: str
    label: str
    properties: dict[str, str]
    type: str


class OtelData(TypedDict):
    """TypedDict for OtelData"""

    span_id: str
    trace_id: str
    event_type: str
    prev_span_id: NotRequired[str]
    job_name: str
```
New file (+10 lines):

# Note

To be able to connect to the Memgraph Docker container, you must create a network that both the dev container and the Memgraph container use.

The following has to be added to `devcontainer.json`:

```json
"runArgs": [
    "--network=<custom_network>"
],
```
New file (+98 lines):

```python
"""Module to generate JSON data representing OTel data."""

import json
import random
import string


def generate_id() -> str:
    """Function to generate a random unique id."""
    return "".join(
        random.choices(string.ascii_lowercase + string.digits, k=16)
    )


def generate_dummy_data(
    num_traces: int = 1, max_depth: int = 2
) -> list[dict[str, str | None]]:
    """Function to generate dummy OTel data.

    :param num_traces: Number of traces to generate
    :type num_traces: `int`
    :param max_depth: The max depth of the tree to be generated
    :type max_depth: `int`
    :return: List of dictionaries, each representing an OTel event
    :rtype: `list`[`dict`[`str`, `str` | `None`]]
    """
    data: list[dict[str, str | None]] = []
    event_types = ["A", "B", "C", "D"]
    job_names = ["job_1", "job_2", "job_3"]

    def generate_spans(
        job_name: str,
        trace_id: str,
        prev_span_id: str | None,
        current_depth: int,
    ) -> None:
        """Recursive function to generate OTel spans.

        :param job_name: The job name
        :type job_name: `str`
        :param trace_id: The trace ID of the span
        :type trace_id: `str`
        :param prev_span_id: The span ID of the parent span
        :type prev_span_id: `str` | `None`
        :param current_depth: The current depth of the generated tree
        :type current_depth: `int`
        """
        if current_depth > max_depth:
            return

        num_branches = random.randint(1, 3)
        for _ in range(num_branches):
            span_id = generate_id()
            event_type = random.choice(event_types)

            span = {
                "span_id": span_id,
                "trace_id": trace_id,
                "event_type": event_type,
                "prev_span_id": prev_span_id,
                "job_name": job_name,
            }

            data.append(span)

            # Recursively generate child spans
            generate_spans(job_name, trace_id, span_id, current_depth + 1)

    for _ in range(num_traces):
        trace_id = generate_id()
        root_span_id = generate_id()
        job_name = random.choice(job_names)

        # Create the root span
        root_span = {
            "span_id": root_span_id,
            "trace_id": trace_id,
            "event_type": "start",
            "prev_span_id": None,
            "job_name": job_name,
        }
        data.append(root_span)

        # Generate the rest of the trace
        generate_spans(job_name, trace_id, root_span_id, 1)

    return data


if __name__ == "__main__":
    # Generate dummy data
    dummy_data = generate_dummy_data()

    # Save to a file
    with open("./graph_solution_poc/data/dummy_trace_data.json", "w") as f:
        json.dump(dummy_data, f, indent=2)
```
