Merge pull request #23 from tminaorg/communications
Communications
Showing 11 changed files with 574 additions and 6 deletions.
@@ -21,4 +21,6 @@
go.work

brzaguza-bin
brzaguza.log

.vscode/*
@@ -0,0 +1,160 @@
package google

import (
    "context"
    "fmt"
    "net/url"
    "strconv"
    "strings"

    "github.com/gocolly/colly/v2"
    "github.com/rs/zerolog/log"
    "github.com/tminaorg/brzaguza/src/rank"
    "github.com/tminaorg/brzaguza/src/search/limit"
    "github.com/tminaorg/brzaguza/src/search/useragent"
    "github.com/tminaorg/brzaguza/src/structures"
)

const seURL string = "https://www.google.com/search?q="
const resPerPage int = 10

func Search(ctx context.Context, query string, relay *structures.Relay, options *structures.Options) error {
    if ctx == nil {
        ctx = context.Background()
    } // not strictly necessary: search.go always passes a non-nil ctx, and the branch predictor will skip this check

    if err := limit.RateLimit.Wait(ctx); err != nil {
        return err
    }

    if options.UserAgent == "" {
        options.UserAgent = useragent.RandomUserAgent()
    }
    log.Trace().Msgf("%v", options.UserAgent)

    var col *colly.Collector
    if options.MaxPages == 1 {
        col = colly.NewCollector(colly.MaxDepth(1), colly.UserAgent(options.UserAgent)) // synchronous, so there is no thread creation overhead for a single page
    } else {
        col = colly.NewCollector(colly.MaxDepth(1), colly.UserAgent(options.UserAgent), colly.Async(true))
    }
    pagesCol := colly.NewCollector(colly.MaxDepth(1), colly.UserAgent(options.UserAgent), colly.Async(true))

    var retError error

    pagesCol.OnRequest(func(r *colly.Request) {
        if err := ctx.Err(); err != nil { // if the context has already been cancelled, abort the request
            r.Abort()
            retError = err
            return
        }
        r.Ctx.Put("originalURL", r.URL.String())
    })

    pagesCol.OnError(func(r *colly.Response, err error) {
        retError = err
    })

    pagesCol.OnResponse(func(r *colly.Response) {
        urll := r.Ctx.Get("originalURL")

        setResultResponse(urll, r, relay)
    })

    col.OnRequest(func(r *colly.Request) {
        if err := ctx.Err(); err != nil { // if the context has already been cancelled, abort the request
            r.Abort()
            retError = err
            return
        }
    })

    col.OnError(func(r *colly.Response, err error) {
        retError = err
    })

    var pageRankCounter []int = make([]int, options.MaxPages*resPerPage)

    col.OnHTML("div.g", func(e *colly.HTMLElement) {
        dom := e.DOM

        linkHref, _ := dom.Find("a").Attr("href")
        linkText := strings.TrimSpace(linkHref)
        titleText := strings.TrimSpace(dom.Find("div > div > div > a > h3").Text())
        descText := strings.TrimSpace(dom.Find("div > div > div > div:first-child > span:first-child").Text())

        if linkText != "" && linkText != "#" && titleText != "" {
            pageNum := getPageNum(e.Request.URL.String())
            res := structures.Result{
                Rank:        -1,
                SEPageRank:  pageRankCounter[pageNum],
                SEPage:      pageNum,
                URL:         linkText,
                Title:       titleText,
                Description: descText,
            }
            pageRankCounter[pageNum]++

            setResult(&res, relay, options, pagesCol)
        }
    })

    col.Visit(seURL + query + "&start=0")
    for i := 1; i < options.MaxPages; i++ {
        col.Visit(seURL + query + "&start=" + strconv.Itoa(i*10))
    }

    col.Wait()
    pagesCol.Wait()

    return retError
}

func setResult(result *structures.Result, relay *structures.Relay, options *structures.Options, pagesCol *colly.Collector) {
    log.Trace().Msgf("Got Result %v: %v", result.Title, result.URL)

    relay.Mutex.Lock()
    mapRes, exists := relay.ResultMap[result.URL]

    if !exists {
        relay.ResultMap[result.URL] = result
    } else if len(mapRes.Description) < len(result.Description) {
        mapRes.Description = result.Description
    }
    relay.Mutex.Unlock()

    if !exists && options.VisitPages {
        pagesCol.Visit(result.URL)
    }
}

func setResultResponse(link string, response *colly.Response, relay *structures.Relay) {
    log.Trace().Msgf("Got Response %v", link)

    relay.Mutex.Lock()
    mapRes, exists := relay.ResultMap[link]

    if !exists {
        log.Error().Msgf("URL not in map when adding response! Should not be possible. URL: %v", link)
        relay.Mutex.Unlock()
        return
    }

    mapRes.Response = response

    resCopy := *mapRes
    rankAddr := &(mapRes.Rank)
    relay.Mutex.Unlock()
    rank.SetRank(&resCopy, rankAddr, &(relay.Mutex)) // the copy still holds the pointer to the response
}

func getPageNum(uri string) int {
    urll, err := url.Parse(uri)
    if err != nil {
        fmt.Println(err)
    }
    qry := urll.Query()
    startString := qry.Get("start")
    startInt, _ := strconv.Atoi(startString)
    return startInt / 10
}
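For orientation, here is a minimal sketch of driving this engine on its own, using only the Relay and Options fields that appear in this diff. The structures package itself is not shown here, so the initialization below is an assumption rather than the committed wiring:

    package main

    import (
        "context"
        "fmt"

        "github.com/tminaorg/brzaguza/src/engines/google"
        "github.com/tminaorg/brzaguza/src/structures"
    )

    func main() {
        // assumed minimal setup; only ResultMap is strictly needed by setResult/setResultResponse
        relay := structures.Relay{
            ResultMap: make(map[string]*structures.Result),
        }
        options := structures.Options{MaxPages: 2, VisitPages: false}

        if err := google.Search(context.Background(), "gopher+colly", &relay, &options); err != nil {
            fmt.Println(err)
        }
        for link, res := range relay.ResultMap {
            fmt.Println(res.SEPage, res.SEPageRank, link)
        }
    }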
@@ -1,6 +1,35 @@
package main

import (
    "fmt"
    "time"

    "github.com/rs/zerolog/log"
    "github.com/tminaorg/brzaguza/src/search"
    "github.com/tminaorg/brzaguza/src/structures"
)

func printResults(results []structures.Result) {
    for _, r := range results {
        fmt.Printf("%v -----\n\t\"%v\"\n\t\"%v\"\n\t\"%v\"\n", r.Rank, r.Title, r.URL, r.Description)
    }
}

func main() {
    setupCli()
    setupLog()

    log.Info().
        Str("query", cli.Query).
        Str("max-pages", fmt.Sprintf("%v", cli.MaxPages)).
        Str("visit", fmt.Sprintf("%v", cli.Visit)).
        Msg("Started searching")

    start := time.Now()
    results := search.PerformSearch(cli.Query, cli.MaxPages, cli.Visit)
    duration := time.Since(start)

    printResults(results)
    log.Info().
        Msg(fmt.Sprintf("Found %v results in %vms", len(results), duration.Milliseconds()))
}
@@ -0,0 +1,29 @@
package rank

import (
    "sync"

    "github.com/rs/zerolog/log"
    "github.com/tminaorg/brzaguza/src/structures"
)

// TL;DR: you must mutex.Lock() when changing *rankAddr; you probably don't need mutex.RLock() when reading result.
// (In reality even *rankAddr shouldn't need a lock, but the Go race detector would complain about the simultaneous read/write.)
func SetRank(result *structures.Result, rankAddr *int, mutex *sync.RWMutex) {
    //mutex.RLock()
    reqUrl := result.Response.Request.URL.String() // dummy code; if this errors, uncomment the locks
    //mutex.RUnlock()

    if reqUrl != result.URL { // dummy code
        log.Trace().Msg("Request URL not same as result.URL \\/")
    }

    rrank := result.SEPage*100 + result.SEPageRank

    mutex.Lock()
    *rankAddr = rrank
    mutex.Unlock()

    log.Trace().Msgf("Set rank to %v for %v: %v", rrank, result.Title, result.URL)
}
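The rank computed here packs the result's page and its position on that page into one integer: SEPage*100 + SEPageRank. A result at position 3 on page 2 (both zero-indexed) gets 2*100 + 3 = 203, so earlier pages always sort ahead of later ones as long as a page yields fewer than 100 results. A tiny illustration (plain Go, not part of the commit):

    package main

    import "fmt"

    func main() {
        // page*100 + positionOnPage: the page number dominates the position on the page
        fmt.Println(0*100+7, 1*100+0, 2*100+3) // 7 100 203
    }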
@@ -0,0 +1,18 @@
package limit

import (
    "errors"

    "golang.org/x/time/rate"
)

// ErrRateLimited indicates that you have been detected scraping and temporarily blocked.
// The duration of the block is unspecified.
var ErrRateLimited = errors.New("ratelimited")

// RateLimit sets a global limit on how many requests can be made in a given time interval.
// The default is unlimited (but obviously you will get blocked temporarily if you make too many
// calls too quickly).
//
// See: https://godoc.org/golang.org/x/time/rate#NewLimiter
var RateLimit = rate.NewLimiter(rate.Inf, 0)
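Because the default limiter is rate.NewLimiter(rate.Inf, 0), every limit.RateLimit.Wait(ctx) call in the google engine returns immediately. A caller that wants real throttling could swap in a finite limiter before searching; a sketch assuming one request per second (the figure is illustrative, not something this commit prescribes):

    package main

    import (
        "time"

        "golang.org/x/time/rate"

        "github.com/tminaorg/brzaguza/src/search/limit"
    )

    func init() {
        // hypothetical policy: allow at most one outgoing search request per second, with a burst of 1
        limit.RateLimit = rate.NewLimiter(rate.Every(time.Second), 1)
    }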
@@ -0,0 +1,55 @@
package search

import (
    "context"
    "sort"
    "strings"

    "github.com/rs/zerolog/log"
    "github.com/sourcegraph/conc"
    "github.com/tminaorg/brzaguza/src/engines/google"
    "github.com/tminaorg/brzaguza/src/structures"
)

func cleanQuery(query string) string {
    return strings.Replace(strings.Trim(query, " "), " ", "+", -1)
}

func PerformSearch(query string, maxPages int, visitPages bool) []structures.Result {
    relay := structures.Relay{
        ResultMap:         make(map[string]*structures.Result),
        EngineDoneChannel: make(chan bool),
    }

    options := structures.Options{
        MaxPages:   maxPages,
        VisitPages: visitPages,
    }

    query = cleanQuery(query)

    var worker conc.WaitGroup

    worker.Go(func() {
        err := google.Search(context.Background(), query, &relay, &options)
        if err != nil {
            log.Error().Err(err).Msg("Failed searching google.com")
        }
    })

    worker.Wait()

    var results []structures.Result = make([]structures.Result, 0, len(relay.ResultMap))
    for _, res := range relay.ResultMap {
        results = append(results, *res)
    }

    sort.Sort(structures.ByRank(results))

    log.Debug().Msg("All processing done, waiting for closing of goroutines.")
    worker.Wait()

    log.Debug().Msg("Done! Received All Engines!")

    return results
}
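sort.Sort(structures.ByRank(results)) implies that ByRank satisfies sort.Interface over []Result. The structures package is not part of this diff, but the conventional shape of such a type would be roughly the following (an assumption about code outside this commit, not the committed definition):

    package structures

    // ByRank is assumed to sort results by ascending Rank (lower rank first).
    type ByRank []Result

    func (r ByRank) Len() int           { return len(r) }
    func (r ByRank) Swap(i, j int)      { r[i], r[j] = r[j], r[i] }
    func (r ByRank) Less(i, j int) bool { return r[i].Rank < r[j].Rank }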
@@ -0,0 +1,24 @@
package useragent

import (
    "math/rand"
    "time"
)

// lowercase private list of user agents
var defaultUserAgentList = [...]string{
    // Chrome: Windows, MacOS, Linux
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    // Edge: Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36 Edg/114.0.0.0",
    // Firefox: Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/113.0",
}

func RandomUserAgent() string {
    randSrc := rand.NewSource(time.Now().UnixNano())
    randGen := rand.New(randSrc)
    return defaultUserAgentList[randGen.Intn(len(defaultUserAgentList))]
}
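RandomUserAgent seeds a fresh source on every call, which is inexpensive at this call rate and keeps the package free of shared mutable state. A quick way to exercise it (hypothetical snippet, not part of the commit):

    package main

    import (
        "fmt"

        "github.com/tminaorg/brzaguza/src/search/useragent"
    )

    func main() {
        // prints three of the five hard-coded desktop user agents, possibly with repeats
        for i := 0; i < 3; i++ {
            fmt.Println(useragent.RandomUserAgent())
        }
    }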