Skip to content

Commit

Permalink
Merge pull request #23 from tminaorg/communications
Browse files Browse the repository at this point in the history
Communications
  • Loading branch information
k4lizen authored Jul 31, 2023
2 parents 7f807d3 + f982082 commit 240a94b
Show file tree
Hide file tree
Showing 11 changed files with 574 additions and 6 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,6 @@
go.work

brzaguza-bin
brzaguza.log
brzaguza.log

.vscode/*
20 changes: 19 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,33 @@ go 1.20

require (
github.com/alecthomas/kong v0.8.0
github.com/gocolly/colly/v2 v2.1.0
github.com/natefinch/lumberjack v2.0.0+incompatible
github.com/rs/zerolog v1.29.1
github.com/rs/zerolog v1.30.0
golang.org/x/time v0.3.0
)

require (
github.com/BurntSushi/toml v1.3.2 // indirect
github.com/PuerkitoBio/goquery v1.8.1 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/antchfx/htmlquery v1.3.0 // indirect
github.com/antchfx/xmlquery v1.3.17 // indirect
github.com/antchfx/xpath v1.2.4 // indirect
github.com/gobwas/glob v0.2.3 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/kennygrant/sanitize v1.2.4 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
github.com/saintfish/chardet v0.0.0-20230101081208-5e3ef4b5456d // indirect
github.com/sourcegraph/conc v0.3.0
github.com/temoto/robotstxt v1.1.2 // indirect
golang.org/x/net v0.12.0 // indirect
golang.org/x/sys v0.10.0 // indirect
golang.org/x/text v0.11.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/protobuf v1.31.0 // indirect
gopkg.in/natefinch/lumberjack.v2 v2.2.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
)
185 changes: 182 additions & 3 deletions go.sum

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion src/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ var (

// flags
Query string `type:"string" default:"${query_string}" env:"BRZAGUZA_QUERY" help:"Query string used for search"`
MaxPages int `type:"counter" default:"1" env:"BRZAGUZA_MAX_PAGES" help:"Number of pages to search"`
Visit bool `type:"bool" default:"false" env:"BRZAGUZA_VISIT" help:"Should results be visited"`
Log string `type:"path" default:"${log_file}" env:"BRZAGUZA_LOG_FILE" help:"Log file path"`
Verbosity int `type:"counter" default:"0" short:"v" env:"BRZAGUZA_VERBOSITY" help:"Log level verbosity"`
}
Expand Down Expand Up @@ -48,7 +50,7 @@ func setupCli() {
Compact: true,
}),
kong.Vars{
"version": fmt.Sprintf("%s (%s@%s)", Version, GitCommit, Timestamp),
"version": fmt.Sprintf("%v (%v@%v)", Version, GitCommit, Timestamp),
"log_file": "brzaguza.log",
"query_string": "cars for sale in Toronto, Canada",
},
Expand Down
160 changes: 160 additions & 0 deletions src/engines/google/google.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
package google

import (
"context"
"fmt"
"net/url"
"strconv"
"strings"

"github.com/gocolly/colly/v2"
"github.com/rs/zerolog/log"
"github.com/tminaorg/brzaguza/src/rank"
"github.com/tminaorg/brzaguza/src/search/limit"
"github.com/tminaorg/brzaguza/src/search/useragent"
"github.com/tminaorg/brzaguza/src/structures"
)

const seURL string = "https://www.google.com/search?q="
const resPerPage int = 10

// Search scrapes Google result pages for query and records results in relay.
// It fetches up to options.MaxPages pages (resPerPage results each) and, when
// options.VisitPages is set, additionally visits every newly found result URL
// with a second collector to capture its response. Returns the last error
// observed by either collector, or the context/rate-limit error.
func Search(ctx context.Context, query string, relay *structures.Relay, options *structures.Options) error {
// Defensive default; callers in search.go always pass a non-nil context.
if ctx == nil {
ctx = context.Background()
}

// Honor the global scrape rate limit before issuing any request.
if err := limit.RateLimit.Wait(ctx); err != nil {
return err
}

// Pick a random user agent unless the caller pinned one.
if options.UserAgent == "" {
options.UserAgent = useragent.RandomUserAgent()
}
log.Trace().Msgf("%v", options.UserAgent)

// col scrapes the search result pages; pagesCol fetches the result URLs
// themselves. col only goes async when more than one page is requested,
// so the single-page case avoids thread creation overhead.
var col *colly.Collector
if options.MaxPages == 1 {
col = colly.NewCollector(colly.MaxDepth(1), colly.UserAgent(options.UserAgent)) // so there is no thread creation overhead
} else {
col = colly.NewCollector(colly.MaxDepth(1), colly.UserAgent(options.UserAgent), colly.Async(true))
}
pagesCol := colly.NewCollector(colly.MaxDepth(1), colly.UserAgent(options.UserAgent), colly.Async(true))

// retError holds the most recent error from any callback; it is what the
// function ultimately returns after both collectors drain.
var retError error

pagesCol.OnRequest(func(r *colly.Request) {
// Abort the request if the search context was canceled meanwhile.
if err := ctx.Err(); err != nil {
r.Abort()
retError = err
return
}
// Remember the URL we asked for, so OnResponse can match the response
// back to its map entry even after redirects.
r.Ctx.Put("originalURL", r.URL.String())
})

pagesCol.OnError(func(r *colly.Response, err error) {
retError = err
})

pagesCol.OnResponse(func(r *colly.Response) {
urll := r.Ctx.Get("originalURL")

setResultResponse(urll, r, relay)
})

col.OnRequest(func(r *colly.Request) {
// Abort the request if the search context was canceled meanwhile.
if err := ctx.Err(); err != nil {
r.Abort()
retError = err
return
}
})

col.OnError(func(r *colly.Response, err error) {
retError = err
})

// Per-page counter of results seen so far, used to assign the in-page rank.
var pageRankCounter []int = make([]int, options.MaxPages*resPerPage)

// Each organic result sits in a div with class "g".
col.OnHTML("div.g", func(e *colly.HTMLElement) {
dom := e.DOM

linkHref, _ := dom.Find("a").Attr("href")
linkText := strings.TrimSpace(linkHref)
titleText := strings.TrimSpace(dom.Find("div > div > div > a > h3").Text())
descText := strings.TrimSpace(dom.Find("div > div > div > div:first-child > span:first-child").Text())

// Skip ad/navigation anchors: empty or "#" links, or entries without a title.
if linkText != "" && linkText != "#" && titleText != "" {
pageNum := getPageNum(e.Request.URL.String())
// Rank starts at -1 and is filled in later by rank.SetRank.
res := structures.Result{
Rank: -1,
SEPageRank: pageRankCounter[pageNum],
SEPage: pageNum,
URL: linkText,
Title: titleText,
Description: descText,
}
pageRankCounter[pageNum]++

setResult(&res, relay, options, pagesCol)
}
})

// Kick off one visit per requested page; Google pages by the "start" offset.
col.Visit(seURL + query + "&start=0")
for i := 1; i < options.MaxPages; i++ {
col.Visit(seURL + query + "&start=" + strconv.Itoa(i*10))
}

// Wait for the search pages first, then for any result-page visits they queued.
col.Wait()
pagesCol.Wait()

return retError
}

// setResult records a scraped result in the shared relay map. The first
// occurrence of a URL is stored as-is; a duplicate only upgrades the stored
// description when the new one is longer. A freshly inserted URL is queued
// on pagesCol when page visiting is enabled.
func setResult(result *structures.Result, relay *structures.Relay, options *structures.Options, pagesCol *colly.Collector) {
	log.Trace().Msgf("Got Result %v: %v", result.Title, result.URL)

	relay.Mutex.Lock()
	stored, found := relay.ResultMap[result.URL]
	switch {
	case !found:
		relay.ResultMap[result.URL] = result
	case len(stored.Description) < len(result.Description):
		stored.Description = result.Description
	}
	relay.Mutex.Unlock()

	// Visit outside the lock; only first-time URLs are fetched.
	if options.VisitPages && !found {
		pagesCol.Visit(result.URL)
	}
}

// setResultResponse attaches a fetched page response to the result previously
// stored for link, then computes that result's rank. The entry must already
// exist in relay.ResultMap — setResult inserts it before pagesCol ever visits
// the URL — so a miss here is logged as an internal error.
func setResultResponse(link string, response *colly.Response, relay *structures.Relay) {
log.Trace().Msgf("Got Response %v", link)

relay.Mutex.Lock()
mapRes, exists := relay.ResultMap[link]

if !exists {
log.Error().Msgf("URL not in map when adding response! Should not be possible. URL: %v", link)
relay.Mutex.Unlock()
return
}

mapRes.Response = response

// Ranking happens outside the lock: work on a value copy of the result but
// keep a pointer to the live Rank field so SetRank can store into the map
// entry (SetRank re-locks just for that store). Note the copy still shares
// the *colly.Response pointer with the map entry.
resCopy := *mapRes
rankAddr := &(mapRes.Rank)
relay.Mutex.Unlock()
rank.SetRank(&resCopy, rankAddr, &(relay.Mutex)) //copy contains pointer to response
}

// getPageNum derives the zero-based result-page index from a Google search
// URL by reading its "start" query parameter (10 results per page).
// Unparseable URLs and missing or non-numeric "start" values yield page 0.
func getPageNum(uri string) int {
	parsed, err := url.Parse(uri)
	if err != nil {
		fmt.Println(err)
		// Bail out early: parsed is nil on error, so calling Query() on it
		// would panic with a nil-pointer dereference.
		return 0
	}
	startString := parsed.Query().Get("start")
	startInt, _ := strconv.Atoi(startString)
	return startInt / 10
}
29 changes: 29 additions & 0 deletions src/main.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,35 @@
package main

import (
"fmt"
"time"

"github.com/rs/zerolog/log"
"github.com/tminaorg/brzaguza/src/search"
"github.com/tminaorg/brzaguza/src/structures"
)

// printResults writes each result's rank, title, URL and description to stdout.
func printResults(results []structures.Result) {
	for i := range results {
		res := &results[i]
		fmt.Printf("%v -----\n\t\"%v\"\n\t\"%v\"\n\t\"%v\"\n", res.Rank, res.Title, res.URL, res.Description)
	}
}

// main wires up the CLI and logging, runs the search, and prints the ranked
// results along with how long the search took.
func main() {
	setupCli()
	setupLog()

	log.Info().
		Str("query", cli.Query).
		Str("max-pages", fmt.Sprintf("%v", cli.MaxPages)).
		Str("visit", fmt.Sprintf("%v", cli.Visit)).
		Msg("Started searching")

	searchStart := time.Now()
	results := search.PerformSearch(cli.Query, cli.MaxPages, cli.Visit)
	elapsed := time.Since(searchStart)

	printResults(results)
	log.Info().Msgf("Found %v results in %vms", len(results), elapsed.Milliseconds())
}
29 changes: 29 additions & 0 deletions src/rank/rank.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package rank

import (
"sync"

"github.com/rs/zerolog/log"
"github.com/tminaorg/brzaguza/src/structures"
)

// SetRank computes a result's rank from its page number and in-page position
// and stores it through rankAddr.
//
// Locking: only the store through *rankAddr needs mutex.Lock, since another
// goroutine may read the same slot concurrently (Go's race detector would
// flag an unguarded simultaneous read/write). Reading the result copy itself
// does not need an RLock here.
func SetRank(result *structures.Result, rankAddr *int, mutex *sync.RWMutex) {
	// Sanity check (placeholder logic): compare the URL colly actually
	// requested with the URL recorded for the result. If a race ever shows
	// up on this read, guard it with mutex.RLock()/RUnlock().
	requestURL := result.Response.Request.URL.String()
	if requestURL != result.URL {
		log.Trace().Msg("Request URL not same as result.URL \\/")
	}

	computedRank := result.SEPage*100 + result.SEPageRank

	mutex.Lock()
	*rankAddr = computedRank
	mutex.Unlock()

	log.Trace().Msgf("Set rank to %v for %v: %v", computedRank, result.Title, result.URL)
}
18 changes: 18 additions & 0 deletions src/search/limit/limit.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package limit

import (
"errors"

"golang.org/x/time/rate"
)

// ErrRateLimited indicates that the scraped site has detected us and
// temporarily blocked further requests. The duration of the block is
// unspecified.
var ErrRateLimited = errors.New("ratelimited")

// RateLimit sets a global limit to how many requests can be made in a given
// time interval. The default is unlimited (but obviously you will get blocked
// temporarily if you do too many calls too quickly).
//
// With a limit of rate.Inf the burst size is ignored, so 0 is fine here.
//
// See: https://godoc.org/golang.org/x/time/rate#NewLimiter
var RateLimit = rate.NewLimiter(rate.Inf, 0)
55 changes: 55 additions & 0 deletions src/search/search.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package search

import (
"context"
"sort"
"strings"

"github.com/rs/zerolog/log"
"github.com/sourcegraph/conc"
"github.com/tminaorg/brzaguza/src/engines/google"
"github.com/tminaorg/brzaguza/src/structures"
)

// cleanQuery normalizes a user query for use in a search URL: surrounding
// whitespace (spaces, tabs, newlines) is stripped and the remaining inner
// spaces become '+' separators.
func cleanQuery(query string) string {
	return strings.ReplaceAll(strings.TrimSpace(query), " ", "+")
}

// PerformSearch runs the supported engines (currently Google) for query,
// fetching up to maxPages result pages and optionally visiting each result
// URL, and returns the collected results sorted by rank.
func PerformSearch(query string, maxPages int, visitPages bool) []structures.Result {
	relay := structures.Relay{
		ResultMap:         make(map[string]*structures.Result),
		EngineDoneChannel: make(chan bool),
	}

	options := structures.Options{
		MaxPages:   maxPages,
		VisitPages: visitPages,
	}

	query = cleanQuery(query)

	var worker conc.WaitGroup

	worker.Go(func() {
		err := google.Search(context.Background(), query, &relay, &options)
		if err != nil {
			log.Error().Err(err).Msg("Failed searching google.com")
		}
	})

	// Block until every engine goroutine has finished; only after this is
	// the result map safe to read without holding the relay mutex.
	// (A second Wait after collection was redundant and has been removed.)
	worker.Wait()

	results := make([]structures.Result, 0, len(relay.ResultMap))
	for _, res := range relay.ResultMap {
		results = append(results, *res)
	}

	sort.Sort(structures.ByRank(results))

	log.Debug().Msg("Done! Received All Engines!")

	return results
}
24 changes: 24 additions & 0 deletions src/search/useragent/useragent.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package useragent

import (
"math/rand"
"time"
)

// defaultUserAgentList is the private pool of desktop browser user agents
// that RandomUserAgent draws from.
var defaultUserAgentList = [...]string{
// Chrome: Windows, MacOS, Linux
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
// Edge: Windows
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.0.0",
// Firefox: Windows
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
}

// RandomUserAgent picks one entry from defaultUserAgentList at random.
//
// NOTE(review): a fresh generator is seeded from the wall clock on every
// call, so calls landing in the same nanosecond return the same agent and
// each call allocates a generator. A package-level seeded generator (or the
// auto-seeded global rand, Go 1.20+) would avoid both — confirm concurrency
// requirements before changing.
func RandomUserAgent() string {
	generator := rand.New(rand.NewSource(time.Now().UnixNano()))
	index := generator.Intn(len(defaultUserAgentList))
	return defaultUserAgentList[index]
}
Loading

0 comments on commit 240a94b

Please sign in to comment.