init

lezhnev74 · Jan 10, 2024 · 943b5a1 · 943b5a1
1 parent 25b8682
commit 943b5a1
Show file tree

Hide file tree

Showing 93 changed files with 21,536 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+.idea
+local
+dev_notes
diff --git a/Heaplog.jpg b/Heaplog.jpg
diff --git a/README.md b/README.md
@@ -0,0 +1,75 @@
+# Search For Local Log Files
+
+![Heaplog logo](Heaplog.jpg)
+
+// main branch build status
+
+Heaplog is a program that runs in the background, scans and indexes your log files, and lets you search them via a Web UI.
+It aims to use little disk space and to allow fast searches using its query language (see below).
+
+## Installation
+
+## Configuration
+
+Configuration can be provided as a YAML file, as well as command-line arguments (where the latter override the former).
+Configurable keys and values can be seen in [config.go](https://github.com/lezhnev74/heaplog/ui/config.go).
+To populate a new empty file run `heaplog init > heaplog.yml`.
+
+Since there are many formats of log files, you have to provide two things about your file format:
+1. A regular expression to find individual messages (config key `MessageStartRE`) in your files.
+2. A date format (config key `DateFormat`) to parse their timestamps.
+
+### Use Automatic Format Detection Command
+
+The `heaplog detect` command asks you for a sample log message and tries to detect the date format automatically.
+If it succeeds, you can copy the output config values and go to testing your config.
+
+Sample output:
+```
+$ heaplog detect
+Enter a sample message line:
+[2023-12-31T00:00:03.448201+00:00] production.DEBUG: My message
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ Yay, the date detected above!
+
+Config values:
+MessageStartRE: "(?m)^\[(\d{4}\-\d{2}\-\d{2}\w\d{2}:\d{2}:\d{2}\.\d{6}[+-]\d{2}:\d{2})"
+DateFormat: "2006-01-02T15:04:05.000000-07:00"
+```
+
+### Use ChatGPT
+
+Use the power of AI to do the job for you :) Use this prompt to get Go code from which you can copy-paste the regular
+expression as well as the date format for parsing.
+
+```
+Detect the full timestamp in this log message. 
+Write Go code to parse this date.
+
+[2023-12-31T00:00:03.448201+00:00] production.DEBUG: My message
+```
+
+### Provide Format Manually
+
+The program needs a regular expression that detects the beginning of each message (see [re docs](https://pkg.go.dev/regexp/syntax)).
+In the first matching group it must contain the full date of the message.
+Below is the regular expression that can recognize messages and dates of this format:
+```
+[2023-12-31T00:00:03.448201+00:00] production.DEBUG: My message
+
+(?m)^\[([^\]]+)
+```
+
+### Test Your Config
+Once you have configured the app, run this command to make sure everything is ok:
+`heaplog test <path/to/log.file>`.
+
+## Query Language
+
+## Design
+
+See more about design ideas in this blog post.
+
+## Licence
+
+MIT
diff --git a/common/location.go b/common/location.go
@@ -0,0 +1,81 @@
+package common
+
+import (
+	"cmp"
+	"log"
+	"slices"
+)
+
+// Location addresses a half-open area of bytes [Min,Max): Min is the first
+// offset covered and Max is the offset just past the last covered byte.
+type Location struct {
+	Min, Max int64
+}
+
+// Intersects reports whether s and s2 overlap or touch. Both bounds are
+// compared inclusively, so adjacent ranges such as [0,5) and [5,9) are
+// reported as intersecting even though they share no byte.
+func (s Location) Intersects(s2 Location) bool {
+	return s.Min <= s2.Max && s.Max >= s2.Min
+}
+
+// Split slices s into consecutive pieces of at most maxLen bytes each and
+// returns them in ascending order; the last piece holds the remainder.
+// A range that already fits into maxLen is returned as a single piece.
+// A non-positive maxLen cannot make progress, so s is returned unsplit.
+func (s Location) Split(maxLen int64) (ret []Location) {
+	// Without this guard a zero or negative step never shrinks the remainder
+	// and the loop below would append pieces forever.
+	if maxLen <= 0 {
+		return []Location{s}
+	}
+
+	for s.len() > maxLen {
+		ret = append(ret, Location{s.Min, s.Min + maxLen})
+		s.Min += maxLen
+	}
+
+	return append(ret, s)
+}
+
+// len returns the number of bytes covered by s; it is negative when Max < Min.
+func (s Location) len() int64 { return s.Max - s.Min }
+
+// Remove returns the parts of s that are not covered by s2, in ascending
+// order: zero pieces when s2 covers s entirely, one piece when s2 cuts off a
+// side (or does not overlap s at all), two pieces when s2 lies inside s.
+// It panics if either range has Max < Min.
+func (s Location) Remove(s2 Location) (ret []Location) {
+
+	// valid locations
+	if s.len() < 0 || s2.len() < 0 {
+		log.Panicf("Invalid ranges: %v or %v", s, s2)
+	}
+
+	// An empty s2 covers no bytes, so there is nothing to cut out. Without
+	// this guard an empty s2 lying strictly inside s would split s in two.
+	if s2.len() == 0 {
+		return []Location{s}
+	}
+
+	intersection := Location{max(s.Min, s2.Min), min(s.Max, s2.Max)}
+
+	// A negative length means the ranges are disjoint, so s stays untouched.
+	if intersection.len() < 0 {
+		return []Location{s}
+	}
+
+	// Otherwise keep what is left of s before and after the intersection,
+	// dropping empty leftovers.
+	if head := (Location{Min: s.Min, Max: intersection.Min}); head.len() > 0 {
+		ret = append(ret, head)
+	}
+	if tail := (Location{Min: intersection.Max, Max: s.Max}); tail.len() > 0 {
+		ret = append(ret, tail)
+	}
+	return ret
+}
+
+// MergeSegmentLocations sorts src in place by Min and collapses every run of
+// locations for which Intersects reports true into a single location that
+// spans all of them. The merged locations are returned in ascending order.
+// When src holds fewer than two locations it is returned as is.
+func MergeSegmentLocations(src []Location) (ret []Location) {
+	// cmp.Compare is used instead of converting the int64 difference to int:
+	// that conversion truncates where int is 32 bits wide and the subtraction
+	// itself can overflow, either of which may flip the sign of the result.
+	slices.SortFunc(src, func(a, b Location) int { return cmp.Compare(a.Min, b.Min) })
+
+	if len(src) < 2 {
+		return src
+	}
+
+	// cur accumulates the run that is currently being merged.
+	cur := src[0]
+
+	for _, next := range src[1:] {
+		if next.Intersects(cur) {
+			cur = Location{min(cur.Min, next.Min), max(cur.Max, next.Max)}
+			continue
+		}
+
+		// next starts a new run: flush the finished one.
+		ret = append(ret, cur)
+		cur = next
+	}
+
+	return append(ret, cur)
+}
diff --git a/common/query.go b/common/query.go
@@ -0,0 +1,34 @@
+package common
+
+import (
+	"path"
+	"time"
+)
+
+// DataSourceHash is a hashed value; the program works with it only to keep
+// index files separate from each other.
+type DataSourceHash string
+
+// InvertedIndexRoot returns the slash-separated path obtained by joining root
+// with the hash itself.
+func (d DataSourceHash) InvertedIndexRoot(root string) string {
+	hashDir := string(d)
+	return path.Join(root, hashDir)
+}
+
+// HashFile converts the result of HashString(filename) into a DataSourceHash.
+func HashFile(filename string) DataSourceHash {
+	hashed := HashString(filename)
+	return DataSourceHash(hashed)
+}
+
+// QuerySummary groups a query's text and id with its time bounds, build time,
+// completion flag, total count and min/max document times.
+// NOTE(review): field meanings are inferred from names only — confirm against the code that fills them.
+type QuerySummary struct {
+	Text, QueryId     string
+	From, To, BuiltAt *time.Time
+	// NOTE(review): the name suggests true means "finished", while the comment
+	// below says "still in-flight" — confirm which one is intended.
+	Complete          bool // if the query is still in-flight
+	Total             int
+	MinDoc, MaxDoc    *time.Time
+}
+
+// MatchedMessage is a message that matched the query criteria.
+// It carries the message id, its byte Location, its date, the hash of the
+// query and the hash of the data source.
+type MatchedMessage struct {
+	Id         int64
+	Loc        Location
+	Date       time.Time
+	QueryHash  string
+	DataSource DataSourceHash
+}
diff --git a/common/segments.go b/common/segments.go
@@ -0,0 +1,47 @@
+package common
+
+import (
+	"time"
+)
+
+// The idea behind indexing of source files is simple.
+// During the indexing (ingestion) phase we discover all files and split them in somewhat big segments.
+// In each segment we detect individual messages and save all that to the db.
+// We save segments in the inverted index to save disk space.
+//
+// Later during the search phase we use inverted index to find relevant segments. And then select relevant messages.
+// For each message we evaluate the query expression.
+// As the last step we put matched messages to the storage using only ids (quick ingestion via an appender).
+
+// IndexedSegment describes a region of a file in which all the messages were indexed.
+// It contains every message found in the segment; the segment's location and
+// its first/last dates are derived from the first and last entries of Messages.
+type IndexedSegment struct {
+	DataSource DataSourceHash
+	Messages   []IndexedMessage
+}
+
+// Loc returns the area spanning from the start of the first message to the
+// end of the last one, or the zero Location when the segment has no messages.
+func (is IndexedSegment) Loc() Location {
+	if len(is.Messages) == 0 {
+		return Location{}
+	}
+	return Location{
+		Min: is.Messages[0].Loc.Min,
+		Max: is.Messages[len(is.Messages)-1].Loc.Max,
+	}
+}
+
+// MinDate returns the date of the first message, or the zero time.Time when
+// the segment has no messages (mirroring Loc instead of panicking).
+func (is IndexedSegment) MinDate() time.Time {
+	if len(is.Messages) == 0 {
+		return time.Time{}
+	}
+	return is.Messages[0].Date
+}
+
+// MaxDate returns the date of the last message, or the zero time.Time when
+// the segment has no messages (mirroring Loc instead of panicking).
+func (is IndexedSegment) MaxDate() time.Time {
+	if len(is.Messages) == 0 {
+		return time.Time{}
+	}
+	return is.Messages[len(is.Messages)-1].Date
+}
+
+// IndexedMessage is a single message inside an indexed segment: its byte
+// Location, its date, and a tail flag.
+type IndexedMessage struct {
+	Id     int64 // only filled when read from the storage
+	Loc    Location
+	Date   time.Time
+	// NOTE(review): presumably marks the last message of a file, whose end may still grow — confirm.
+	IsTail bool // detect "tail message"
+}
+
+// IndexedSegmentInfo is a summary of a segment: it carries an id, the data
+// source hash, min/max dates, From/To offsets and a message count, but not
+// the messages themselves.
+// NOTE(review): From/To presumably are the segment's byte offsets in the file — confirm.
+type IndexedSegmentInfo struct {
+	Id               int64
+	DataSource       DataSourceHash
+	MinDate, MaxDate time.Time
+	From, To         int64
+	Messages         int64
+}