Skip to content

Commit

Permalink
Merge pull request #315 from hearchco/as/fix/options
Browse files Browse the repository at this point in the history
fix(options)!: default values and validations from the get go
  • Loading branch information
aleksasiriski authored May 28, 2024
2 parents ced3fab + 4c9d220 commit 93a4bcb
Show file tree
Hide file tree
Showing 24 changed files with 135 additions and 156 deletions.
17 changes: 8 additions & 9 deletions src/cli/climode.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ func Run(flags Flags, db cache.DB, conf config.Config) {
log.Info().
Str("queryAnon", anonymize.String(flags.Query)).
Str("queryHash", anonymize.HashToSHA256B64(flags.Query)).
Int("maxPages", flags.MaxPages).
Int("maxPages", flags.PagesMax).
Bool("visit", flags.Visit).
Msg("Started hearching")

Expand All @@ -66,17 +66,16 @@ func Run(flags Flags, db cache.DB, conf config.Config) {
log.Fatal().Err(err).Msg("Invalid category")
}

// all of these have default values set and are validated beforehand
options := engines.Options{
Pages: engines.Pages{
Start: flags.StartPage,
Max: flags.MaxPages,
},
VisitPages: flags.Visit,
Category: categoryName,
UserAgent: flags.UserAgent,
Locale: flags.Locale,
SafeSearch: flags.SafeSearch,
Mobile: flags.Mobile,
Pages: engines.Pages{
Start: flags.PagesStart,
Max: flags.PagesMax,
},
Locale: flags.Locale,
Category: categoryName,
}

start := time.Now()
Expand Down
40 changes: 27 additions & 13 deletions src/cli/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"

"github.com/alecthomas/kong"
"github.com/hearchco/hearchco/src/gotypelimits"
"github.com/hearchco/hearchco/src/search/category"
"github.com/hearchco/hearchco/src/search/engines"
"github.com/rs/zerolog/log"
Expand All @@ -27,34 +28,47 @@ func Setup() Flags {
)

if err := ctx.Validate(); err != nil {
log.Panic().Err(err).Msg("cli.Setup(): failed parsing cli") // panic is also run inside the library. when does this happen?
log.Panic().Caller().Err(err).Msg("failed parsing cli") // panic is also run inside the library. when does this happen?
// ^PANIC
}

if locErr := engines.ValidateLocale(cli.Locale); locErr != nil {
log.Fatal().Err(locErr).Msg("cli.Setup(): invalid locale flag")
if cli.Query == "" {
log.Fatal().Caller().Msg("query cannot be empty or whitespace")
// ^FATAL
}

if _, err := category.FromString(cli.Category); err != nil {
log.Fatal().Msg("cli.Setup(): invalid category flag")
// TODO: make upper limit configurable
pagesMaxUpperLimit := 10
if cli.PagesMax < 1 || cli.PagesMax > pagesMaxUpperLimit {
log.Fatal().
Caller().
Int("pages", cli.PagesMax).
Int("min", 1).
Int("max", pagesMaxUpperLimit).
Msg("pages value out of range")
// ^FATAL
}

if cli.StartPage < 1 {
if cli.PagesStart < 1 || cli.PagesStart > gotypelimits.MaxInt-pagesMaxUpperLimit {
log.Fatal().
Int("startpage", cli.StartPage).
Msg("cli.Setup(): invalid start page flag (must be >= 1)")
Caller().
Int("start", cli.PagesStart).
Int("min", 1).
Int("max", gotypelimits.MaxInt-pagesMaxUpperLimit).
Msg("start value out of range")
// ^FATAL
} else {
// since it's >=1, we decrement it to match the 0-based index
cli.StartPage -= 1
cli.PagesStart -= 1
}

if cli.MaxPages < 1 {
log.Fatal().
Int("maxpages", cli.MaxPages).
Msg("cli.Setup(): invalid max pages flag (must be >= 1)")
if err := engines.ValidateLocale(cli.Locale); err != nil {
log.Fatal().Caller().Err(err).Msg("invalid locale flag")
// ^FATAL
}

if _, err := category.FromString(cli.Category); err != nil {
log.Fatal().Caller().Msg("invalid category flag")
// ^FATAL
}

Expand Down
10 changes: 4 additions & 6 deletions src/cli/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,12 @@ type Flags struct {
DataDirPath string `type:"path" default:"${data_folder}" env:"HEARCHCO_DATA_DIR" help:"Data folder path"`
Verbosity int8 `type:"counter" default:"0" short:"v" env:"HEARCHCO_VERBOSITY" help:"Log level verbosity"`
// options
StartPage int `type:"counter" default:"1" env:"HEARCHCO_START_PAGE" help:"Page from which to start searching (>=1)"`
MaxPages int `type:"counter" default:"1" env:"HEARCHCO_MAX_PAGES" help:"Number of pages to search (>=1)"`
Visit bool `type:"bool" default:"false" env:"HEARCHCO_VISIT" help:"Should results be visited"`
Category string `type:"string" default:"" short:"c" env:"HEARCHCO_CATEGORY" help:"Search result category. Can also be supplied through the query (e.g. \"!info smartphone\"). Supported values: info[/wiki], science[/sci], news, blog, surf, newnews[/nnews]"`
UserAgent string `type:"string" default:"" env:"HEARCHCO_USER_AGENT" help:"The user agent"`
Locale string `type:"string" default:"" env:"HEARCHCO_LOCALE" help:"Locale string specifying result language and region preference. The format is en_US"`
SafeSearch bool `type:"bool" default:"false" env:"HEARCHCO_SAFE_SEARCH" help:"Whether to use safe search"`
Mobile bool `type:"bool" default:"false" env:"HEARCHCO_MOBILE" help:"Whether to gear results towards mobile"`
PagesStart int `type:"counter" default:"1" env:"HEARCHCO_PAGES_START" help:"Page from which to start searching (>=1)"`
PagesMax int `type:"counter" default:"1" env:"HEARCHCO_PAGES_MAX" help:"Number of pages to search (>=1)"`
Locale string `type:"string" default:"en_US" env:"HEARCHCO_LOCALE" help:"Locale string specifying result language and region preference. The format is en_US"`
Category string `type:"string" default:"general" short:"c" env:"HEARCHCO_CATEGORY" help:"Search result category. Can also be supplied through the query (e.g. \"!images smartphone\")."`
// profiler
CPUProfile bool `type:"bool" default:"false" env:"HEARCHCO_CPUPROFILE" help:"Use cpu profiling"`
HeapProfile bool `type:"bool" default:"false" env:"HEARCHCO_HEAPPROFILE" help:"Use heap profiling"`
Expand Down
84 changes: 34 additions & 50 deletions src/router/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ import (

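// Search handles a search request: it validates the form/query parameters, writes the JSON response to w and returns an error if one occurred.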
func Search(w http.ResponseWriter, r *http.Request, db cache.DB, ttlConf config.TTL, settings map[engines.Name]config.Settings, categories map[category.Name]config.Category, salt string) error {
err := r.ParseForm()
if err != nil {
// parse form data (including query params)
if err := r.ParseForm(); err != nil {
// server error
werr := writeResponseJSON(w, http.StatusInternalServerError, ErrorResponse{
Message: "failed to parse form",
Expand All @@ -30,18 +30,7 @@ func Search(w http.ResponseWriter, r *http.Request, db cache.DB, ttlConf config.
return err
}

params := r.Form

query := strings.TrimSpace(getParamOrDefault(params, "q"))
pagesStartS := getParamOrDefault(params, "start", "1")
pagesMaxS := getParamOrDefault(params, "pages", "1")
visitPagesS := getParamOrDefault(params, "deep", "false")
locale := getParamOrDefault(params, "locale", config.DefaultLocale)
categoryS := getParamOrDefault(params, "category", "")
userAgent := getParamOrDefault(params, "useragent", "")
safeSearchS := getParamOrDefault(params, "safesearch", "false")
mobileS := getParamOrDefault(params, "mobile", "false")

query := strings.TrimSpace(getParamOrDefault(r.Form, "q")) // query is required
if query == "" {
// user error
return writeResponseJSON(w, http.StatusBadRequest, ErrorResponse{
Expand All @@ -50,6 +39,27 @@ func Search(w http.ResponseWriter, r *http.Request, db cache.DB, ttlConf config.
})
}

visitPagesS := getParamOrDefault(r.Form, "deep", "false")
visitPages, err := strconv.ParseBool(visitPagesS)
if err != nil {
// user error
return writeResponseJSON(w, http.StatusUnprocessableEntity, ErrorResponse{
Message: "cannot convert deep value to bool",
Value: fmt.Sprintf("%v", err),
})
}

safeSearchS := getParamOrDefault(r.Form, "safesearch", "false")
safeSearch, err := strconv.ParseBool(safeSearchS)
if err != nil {
// user error
return writeResponseJSON(w, http.StatusUnprocessableEntity, ErrorResponse{
Message: "cannot convert safesearch value to bool",
Value: fmt.Sprintf("%v", err),
})
}

pagesMaxS := getParamOrDefault(r.Form, "pages", "1")
pagesMax, err := strconv.Atoi(pagesMaxS)
if err != nil {
// user error
Expand All @@ -58,7 +68,6 @@ func Search(w http.ResponseWriter, r *http.Request, db cache.DB, ttlConf config.
Value: fmt.Sprintf("%v", err),
})
}

// TODO: make upper limit configurable
pagesMaxUpperLimit := 10
if pagesMax < 1 || pagesMax > pagesMaxUpperLimit {
Expand All @@ -69,6 +78,7 @@ func Search(w http.ResponseWriter, r *http.Request, db cache.DB, ttlConf config.
})
}

pagesStartS := getParamOrDefault(r.Form, "start", "1")
pagesStart, err := strconv.Atoi(pagesStartS)
if err != nil {
// user error
Expand All @@ -77,7 +87,6 @@ func Search(w http.ResponseWriter, r *http.Request, db cache.DB, ttlConf config.
Value: fmt.Sprintf("%v", err),
})
}

// make sure that pagesStart can be safely added to pagesMax
if pagesStart < 1 || pagesStart > gotypelimits.MaxInt-pagesMaxUpperLimit {
// user error
Expand All @@ -90,15 +99,7 @@ func Search(w http.ResponseWriter, r *http.Request, db cache.DB, ttlConf config.
pagesStart -= 1
}

visitPages, err := strconv.ParseBool(visitPagesS)
if err != nil {
// user error
return writeResponseJSON(w, http.StatusUnprocessableEntity, ErrorResponse{
Message: "cannot convert deep value to bool",
Value: fmt.Sprintf("%v", err),
})
}

locale := getParamOrDefault(r.Form, "locale", config.DefaultLocale)
err = engines.ValidateLocale(locale)
if err != nil {
// user error
Expand All @@ -108,6 +109,7 @@ func Search(w http.ResponseWriter, r *http.Request, db cache.DB, ttlConf config.
})
}

categoryS := getParamOrDefault(r.Form, "category", category.GENERAL.String())
categoryName, err := category.FromString(categoryS)
if err != nil {
// user error
Expand All @@ -117,38 +119,19 @@ func Search(w http.ResponseWriter, r *http.Request, db cache.DB, ttlConf config.
})
}

safeSearch, err := strconv.ParseBool(safeSearchS)
if err != nil {
// user error
return writeResponseJSON(w, http.StatusUnprocessableEntity, ErrorResponse{
Message: "cannot convert safesearch value to bool",
Value: fmt.Sprintf("%v", err),
})
}

mobile, err := strconv.ParseBool(mobileS)
if err != nil {
// user error
return writeResponseJSON(w, http.StatusUnprocessableEntity, ErrorResponse{
Message: "cannot convert mobile value to bool",
Value: fmt.Sprintf("%v", err),
})
}

// all of these have default values set and are validated beforehand
options := engines.Options{
VisitPages: visitPages,
SafeSearch: safeSearch,
Pages: engines.Pages{
Start: pagesStart,
Max: pagesMax,
},
VisitPages: visitPages,
Category: categoryName,
UserAgent: userAgent,
Locale: locale,
SafeSearch: safeSearch,
Mobile: mobile,
Locale: locale,
Category: categoryName,
}

// search for results in db and web, afterwards return JSON
// search for results
results, foundInDB := search.Search(query, options, db, categories[options.Category], settings, salt)

// send response as soon as possible
Expand All @@ -160,6 +143,7 @@ func Search(w http.ResponseWriter, r *http.Request, db cache.DB, ttlConf config.
err = writeResponseJSON(w, http.StatusOK, resultsOutput)
}

// TODO: this doesn't work on AWS Lambda because the response is already sent (which terminates the process)
// don't return immediately, we want to cache results and update them if necessary
search.CacheAndUpdateResults(query, options, db, ttlConf, categories[options.Category], settings, results, foundInDB, salt)

Expand Down
4 changes: 4 additions & 0 deletions src/search/category/name.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,7 @@ const (
SCIENCE Name = "science"
THOROUGH Name = "thorough"
)

// String returns the category name as a plain string, so that a Name
// satisfies fmt.Stringer.
func (cat Name) String() string {
	name := string(cat)
	return name
}
34 changes: 28 additions & 6 deletions src/search/engines/_sedefaults/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,35 @@ import (
"github.com/hearchco/hearchco/src/config"
"github.com/hearchco/hearchco/src/search/bucket"
"github.com/hearchco/hearchco/src/search/engines"
"github.com/hearchco/hearchco/src/search/useragent"
"github.com/rs/zerolog/log"
)

// it's okay to return pointers to collectors since colly.NewCollector() returns a pointer
func InitializeCollectors(ctx context.Context, engineName engines.Name, options engines.Options, settings config.Settings, timings config.CategoryTimings, relay *bucket.Relay) (*colly.Collector, *colly.Collector) {
col := colly.NewCollector(colly.MaxDepth(1), colly.UserAgent(options.UserAgent), colly.Async())
pagesCol := colly.NewCollector(colly.MaxDepth(1), colly.UserAgent(options.UserAgent), colly.Async())
// get random user agent and corresponding Sec-Ch-Ua header
userAgent, secChUa := useragent.RandomUserAgentWithHeader()

// create collectors
col := colly.NewCollector(
colly.Async(),
colly.MaxDepth(1),
colly.UserAgent(userAgent),
colly.IgnoreRobotsTxt(),
colly.Headers(map[string]string{
"Sec-Ch-Ua": secChUa,
}),
)
pagesCol := colly.NewCollector(
colly.Async(),
colly.MaxDepth(1),
colly.UserAgent(userAgent),
colly.IgnoreRobotsTxt(),
colly.Headers(map[string]string{
"Sec-Ch-Ua": secChUa,
}),
)

// set collector limit rules
limitRule := colly.LimitRule{
DomainGlob: "*",
Delay: timings.Delay,
Expand All @@ -30,12 +51,13 @@ func InitializeCollectors(ctx context.Context, engineName engines.Name, options
Msg("_sedefaults.InitializeCollectors(): failed adding new limit rule")
}

// set collector proxies
if settings.Proxies != nil {
log.Debug().
Strs("proxies", settings.Proxies).
Msg("Using proxies")

// Rotate proxies
// rotate proxies
rp, err := proxy.RoundRobinProxySwitcher(settings.Proxies...)
if err != nil {
log.Fatal().
Expand All @@ -48,11 +70,11 @@ func InitializeCollectors(ctx context.Context, engineName engines.Name, options
pagesCol.SetProxyFunc(rp)
}

// Set up collector
// set up collector
colRequest(col, ctx, engineName, false)
colError(col, engineName, false)

// Set up pages collector
// set up pages collector
colRequest(pagesCol, ctx, engineName, true)
colError(pagesCol, engineName, true)
pagesColResponse(pagesCol, engineName, relay)
Expand Down
Loading

0 comments on commit 93a4bcb

Please sign in to comment.