From 87429f2b3d1b43bdebec1aeb99b10b36f389fd46 Mon Sep 17 00:00:00 2001 From: HAHWUL Date: Sat, 11 Jan 2025 23:43:56 +0900 Subject: [PATCH 1/5] feat(detector): integrate CodeLocator for file tracking during tech detection Signed-off-by: HAHWUL --- src/detector/detector.cr | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/detector/detector.cr b/src/detector/detector.cr index 4a45ad53..245bf236 100644 --- a/src/detector/detector.cr +++ b/src/detector/detector.cr @@ -55,9 +55,12 @@ def detect_techs(base_path : String, options : Hash(String, YAML::Any), passive_ end channel = Channel(String).new + locator = CodeLocator.instance + spawn do Dir.glob("#{base_path}/**/*") do |file| channel.send(file) + locator.push "file_map", file end end From 7a95706d51c862cf34a3c2cbfb44241cbd0d8258 Mon Sep 17 00:00:00 2001 From: HAHWUL Date: Sat, 11 Jan 2025 23:44:08 +0900 Subject: [PATCH 2/5] feat(ollama): enhance endpoint detection by filtering file paths using CodeLocator Signed-off-by: HAHWUL --- .../analyzers/llm_analyzers/ollama.cr | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/analyzer/analyzers/llm_analyzers/ollama.cr b/src/analyzer/analyzers/llm_analyzers/ollama.cr index b9b82d91..82cb875b 100644 --- a/src/analyzer/analyzers/llm_analyzers/ollama.cr +++ b/src/analyzer/analyzers/llm_analyzers/ollama.cr @@ -17,9 +17,28 @@ module Analyzer::AI # Init LLM Instance ollama = LLM::Ollama.new(@llm_url, @model) + locator = CodeLocator.instance + all_paths = locator.all("file_map").join("\n") + + # Filter files that are likely to contain endpoints + filter_prompt = <<-PROMPT + !! Respond only in JSON format. Do not include explanations, comments, or any additional text. !! + --- + Analyze the following list of file paths and identify which files are likely to represent endpoints, including API endpoints, web pages, or static resources. + Return the result as a JSON array of file paths that should be analyzed further. + + File paths: + #{all_paths} + PROMPT + + filter_response = ollama.request(filter_prompt) + filtered_paths = JSON.parse(filter_response.to_s) + logger.debug_sub filter_response + # Source Analysis begin - Dir.glob("#{base_path}/**/*") do |path| + filtered_paths.as_a.each do |jpath| + path = jpath.as_s next if File.directory?(path) relative_path = get_relative_path(base_path, path) From 40404ec2d6583464d0524db3d23d439837cdb5a9 Mon Sep 17 00:00:00 2001 From: HAHWUL Date: Sat, 11 Jan 2025 23:56:07 +0900 Subject: [PATCH 3/5] feat(ollama): improve endpoint filtering logic by adjusting path handling and adding debug logging Signed-off-by: HAHWUL --- .../analyzers/llm_analyzers/ollama.cr | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/src/analyzer/analyzers/llm_analyzers/ollama.cr b/src/analyzer/analyzers/llm_analyzers/ollama.cr index 82cb875b..4296304a 100644 --- a/src/analyzer/analyzers/llm_analyzers/ollama.cr +++ b/src/analyzer/analyzers/llm_analyzers/ollama.cr @@ -18,27 +18,38 @@ module Analyzer::AI ollama = LLM::Ollama.new(@llm_url, @model) locator = CodeLocator.instance - all_paths = locator.all("file_map").join("\n") + all_paths = locator.all("file_map") + target_paths = [] of String - # Filter files that are likely to contain endpoints - filter_prompt = <<-PROMPT - !! Respond only in JSON format. Do not include explanations, comments, or any additional text. !! - --- - Analyze the following list of file paths and identify which files are likely to represent endpoints, including API endpoints, web pages, or static resources. - Return the result as a JSON array of file paths that should be analyzed further. + if all_paths.size > 10 + logger.debug_sub "Ollama::Analyzing filtered files" - File paths: - #{all_paths} - PROMPT + # Filter files that are likely to contain endpoints + filter_prompt = <<-PROMPT + !! Respond only in JSON format. Do not include explanations, comments, or any additional text. !! + --- + Analyze the following list of file paths and identify which files are likely to represent endpoints, including API endpoints, web pages, or static resources. + Return the result as a JSON array of file paths that should be analyzed further. - filter_response = ollama.request(filter_prompt) - filtered_paths = JSON.parse(filter_response.to_s) - logger.debug_sub filter_response + File paths: + #{all_paths.join("\n")} + PROMPT + + filter_response = ollama.request(filter_prompt) + filtered_paths = JSON.parse(filter_response.to_s) + logger.debug_sub filter_response + + filtered_paths.as_a.each do |fpath| + target_paths << fpath.as_s + end + else + logger.debug_sub "Ollama::Analyzing all files" + target_paths = Dir.glob("#{base_path}/**/*") + end # Source Analysis begin - filtered_paths.as_a.each do |jpath| - path = jpath.as_s + target_paths.each do |path| next if File.directory?(path) relative_path = get_relative_path(base_path, path) From 5d8b554083a48807513d33f2d67c0dffc8feefd0 Mon Sep 17 00:00:00 2001 From: HAHWUL Date: Sun, 12 Jan 2025 00:00:37 +0900 Subject: [PATCH 4/5] fix(ollama): remove unnecessary file extension from ignore list Signed-off-by: HAHWUL --- src/analyzer/analyzers/llm_analyzers/ollama.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/analyzer/analyzers/llm_analyzers/ollama.cr b/src/analyzer/analyzers/llm_analyzers/ollama.cr index 4296304a..56933b14 100644 --- a/src/analyzer/analyzers/llm_analyzers/ollama.cr +++ b/src/analyzer/analyzers/llm_analyzers/ollama.cr @@ -121,7 +121,7 @@ module Analyzer::AI end def ignore_extensions - [".js", ".css", ".html", ".xml", ".json", ".yml", ".yaml", ".md", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".ico", ".eot", ".ttf", ".woff", ".woff2", ".otf", ".mp3", ".mp4", ".avi", ".mov", ".webm", ".zip", ".tar", ".gz", ".7z", ".rar", ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".txt", ".csv", ".log", ".sql", ".bak", ".swp"] + [".css", ".xml", ".json", ".yml", ".yaml", ".md", ".jpg", ".jpeg", ".png", ".gif", ".svg", ".ico", ".eot", ".ttf", ".woff", ".woff2", ".otf", ".mp3", ".mp4", ".avi", ".mov", ".webm", ".zip", ".tar", ".gz", ".7z", ".rar", ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx", ".txt", ".csv", ".log", ".sql", ".bak", ".swp"] end end end From c6f79dbf65b90db63bbd8992aca6dcc87111081b Mon Sep 17 00:00:00 2001 From: HAHWUL Date: Sun, 12 Jan 2025 00:18:06 +0900 Subject: [PATCH 5/5] feat(ollama): refine endpoint analysis by excluding directories and focusing on individual files Signed-off-by: HAHWUL --- src/analyzer/analyzers/llm_analyzers/ollama.cr | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/analyzer/analyzers/llm_analyzers/ollama.cr b/src/analyzer/analyzers/llm_analyzers/ollama.cr index 56933b14..961605e8 100644 --- a/src/analyzer/analyzers/llm_analyzers/ollama.cr +++ b/src/analyzer/analyzers/llm_analyzers/ollama.cr @@ -28,7 +28,8 @@ module Analyzer::AI filter_prompt = <<-PROMPT !! Respond only in JSON format. Do not include explanations, comments, or any additional text. !! --- - Analyze the following list of file paths and identify which files are likely to represent endpoints, including API endpoints, web pages, or static resources. + Analyze the following list of file paths and identify which files are likely to represent endpoints, including API endpoints, web pages, or static resources. + Exclude directories from the analysis and focus only on individual files. Return the result as a JSON array of file paths that should be analyzed further. File paths: