From e0eb9f064afee09e5f5720875c99929e13dedb16 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Wed, 12 Jun 2024 16:27:45 +0200 Subject: [PATCH 1/4] initial work to split validation and add specs --- internal/app/event/model.go | 2 +- internal/app/event/repository.go | 7 +- internal/app/event/service.go | 113 ++++++++++-------- internal/app/event/service_test.go | 181 +++++++++++++++++++++++++++-- 4 files changed, 241 insertions(+), 62 deletions(-) diff --git a/internal/app/event/model.go b/internal/app/event/model.go index 8670ae8..905892d 100644 --- a/internal/app/event/model.go +++ b/internal/app/event/model.go @@ -20,4 +20,4 @@ type Event struct { Useragent string `gorm:"-:all" json:"useragent"` } -const TABLE_OPTIONS = "ENGINE=MergeTree PARTITION BY toYYYYMM(timestamp) ORDER BY (repo_id, toDate(timestamp), user_id) SAMPLE BY user_id" \ No newline at end of file +const TABLE_OPTIONS = "ENGINE=MergeTree PARTITION BY toYYYYMM(timestamp) ORDER BY (repo_id, toDate(timestamp), user_id) SAMPLE BY user_id" diff --git a/internal/app/event/repository.go b/internal/app/event/repository.go index c4e044d..393d2f4 100644 --- a/internal/app/event/repository.go +++ b/internal/app/event/repository.go @@ -21,14 +21,14 @@ type EventRepositoryReader interface { // type EventRepository struct { - db *gorm.DB - config *app.Config + db *gorm.DB + config *app.Config } // NewRepository creates a new event repository func NewEventRepository(db *gorm.DB, config *app.Config) *EventRepository { return &EventRepository{ - db: db, + db: db, config: config, } } @@ -38,7 +38,6 @@ func (repository *EventRepository) Create(event *Event) error { return repository.db.Create(event).Error } - // // Plausible implementation of the event repository // diff --git a/internal/app/event/service.go b/internal/app/event/service.go index 05c450e..a0114fd 100644 --- a/internal/app/event/service.go +++ b/internal/app/event/service.go @@ -4,7 +4,7 @@ import ( "encoding/json" "errors" "fmt" - "io/ioutil" + "io" "net/http" "net/url" "strings" @@ -16,17 +16,17 @@ import ( type EventService struct { eventRepository EventRepositoryReader - sessionService *session.SessionService - config *app.Config + sessionService *session.SessionService + config *app.Config } type EventRequest struct { - Name string `json:"name"` - RepoId string `json:"repoId"` - Url string `json:"url"` - Useragent string `json:"useragent"` - ClientIp string `json:"clientIp"` - Pid string `json:"pid"` + Name string `json:"name"` + RepoId string `json:"repoId"` + Url string `json:"url"` + Useragent string `json:"useragent"` + ClientIp string `json:"clientIp"` + Pid string `json:"pid"` } // NewEventService creates a new event service @@ -34,7 +34,7 @@ func NewEventService(repository EventRepositoryReader, sessionService *session.S return &EventService{ eventRepository: repository, sessionService: sessionService, - config: config, + config: config, } } @@ -50,10 +50,10 @@ func (service *EventService) CreateEvent(eventRequest *EventRequest) (Event, err // Get hostname from the url url, err := url.Parse(eventRequest.Url) - if err != nil { - return Event{}, err - } - hostDomain := strings.TrimPrefix(url.Hostname(), "www.") + if err != nil { + return Event{}, err + } + hostDomain := strings.TrimPrefix(url.Hostname(), "www.") // User id is generate conforming to COUNTER rules // It's a cryptographic hash of details with a daily salt. @@ -94,65 +94,84 @@ func (service *EventService) CreateRaw(event Event) (Event, error) { } func (service *EventService) Validate(eventRequest *EventRequest) error { - // Http client - client := &http.Client{} + var err error - // Validate PID when server is set to validate and is a view event - if service.config.ValidateDoi && eventRequest.Name == "view" { - return checkDoiExistsInDataCite(eventRequest.Pid, eventRequest.Url, service.config.DataCite.Url, client); + if eventRequest.Name != "view" { + return err } - return nil + if !service.shouldValidate() { + return err + } + + resp, err := getDoi(eventRequest.Pid, service.config.DataCite.Url) + + if err != nil { + return fmt.Errorf("failed to make request: %v", err) + } + + defer resp.Body.Close() + + if service.config.Validate.DoiExistence { + err = checkDoiExistence(resp) + } + + if err != nil { + return err + } + + if service.config.Validate.DoiUrl { + err = checkDoiUrl(resp, eventRequest.Url) + } + + return err } -func checkDoiExistsInDataCite(doi string, url string, dataciteApiUrl string, client *http.Client) error { - // Make API call to DataCite for DOI +func (service *EventService) shouldValidate() bool { + return service.config.Validate.DoiExistence && service.config.Validate.DoiUrl +} - // URL to send the metric to +func getDoi(doi string, dataciteApiUrl string) (*http.Response, error) { apiUrl := fmt.Sprintf("%s/dois/%s/get-url", dataciteApiUrl, doi) - // Post json to url - resp, _ := http.Get(apiUrl) + return http.Get(apiUrl) +} +func checkDoiExistence(resp *http.Response) error { if resp.StatusCode == 404 { - return errors.New("This DOI doesn't exist in DataCite") + return errors.New("this DOI doesn't exist in DataCite") } - // Close response - defer resp.Body.Close() + return nil +} - // Get Json result - body, _ := ioutil.ReadAll(resp.Body) +func checkDoiUrl(resp *http.Response, url string) error { + body, err := io.ReadAll(resp.Body) - type GetUrlResponse struct { + if err != nil { + return fmt.Errorf("failed to read body: %v", err) + } + + var result struct { Url string `json:"url"` } - var result GetUrlResponse - if err := json.Unmarshal(body, &result); err != nil { - return errors.New("Can not unmarshal JSON") + err = json.Unmarshal(body, &result) + + if err != nil { + return errors.New("can not unmarshal JSON") } - // Compare the result with the url but ignore the protocol if !validateDoiUrl(result.Url, url) { - return errors.New("This DOI doesn't match this URL") + return errors.New("this DOI doesn't match this URL") } return nil } func validateDoiUrl(doiUrl string, urlCompare string) bool { - // Print stripScheme(doiUrl) - fmt.Println(stripScheme(doiUrl)) - // Print stripScheme(urlCompare) - fmt.Println(stripScheme(urlCompare)) - // Compare the result with the url but ignore the protocol - if stripScheme(doiUrl) != stripScheme(urlCompare) { - return false - } - - return true + return stripScheme(doiUrl) == stripScheme(urlCompare) } // Function to strip the scheme from a URL @@ -184,4 +203,4 @@ func CreateMockEvent(metricName string, repoId string, doi string, userId uint64 Pid: doi, Timestamp: timestamp, } -} \ No newline at end of file +} diff --git a/internal/app/event/service_test.go b/internal/app/event/service_test.go index fe8ff5d..67e6cf6 100644 --- a/internal/app/event/service_test.go +++ b/internal/app/event/service_test.go @@ -1,26 +1,187 @@ package event import ( - "net/http" + "fmt" "testing" + + "github.com/datacite/keeshond/internal/app" ) +func buildEventService(dataCiteUrl string, validateDoiExistence bool, validateDoiUrl bool) *EventService { + config := &app.Config{ + DataCite: struct { + Url string + JWT string + JWTPublicKey string + }{ + Url: dataCiteUrl, + }, + Validate: struct { + DoiExistence bool + DoiUrl bool + }{ + DoiExistence: validateDoiExistence, + DoiUrl: validateDoiUrl, + }, + } + + eventService := &EventService{ + config: config, + } + + return eventService +} + func TestValidateDoiUrl(t *testing.T) { - if(!validateDoiUrl("http://www.example.com/url/?foo=bar&foo=baz#this_is_fragment", "https://www.example.com/url?foo=bar&foo=baz#this_is_fragment")) { + if !validateDoiUrl("http://www.example.com/url/?foo=bar&foo=baz#this_is_fragment", "https://www.example.com/url?foo=bar&foo=baz#this_is_fragment") { t.Errorf("validateDoiUrl should return false") } } -// Test checkDoiExistsInDataCite -func TestCheckDoiExistsInDataCite(t *testing.T) { - client := &http.Client{} +// Test Validate +func TestValidateSuccessWithBothDoiExistenceCheckAndDoiUrlCheck(t *testing.T) { + eventService := buildEventService("https://api.stage.datacite.org", true, true) + + eventRequest := &EventRequest{ + Name: "view", + Pid: "10.70102/mdc.jeopardy", + Url: "https://demorepo.stage.datacite.org/datasets/10.70102/mdc.jeopardy", + } + + err := eventService.Validate(eventRequest) + + if err != nil { + t.Errorf("Validate should return nil") + } +} + +func TestValidateSuccessWithOnlyDoiExistenceCheck(t *testing.T) { + eventService := buildEventService("https://api.stage.datacite.org", true, false) - // Check if the DOI exists in DataCite - err := checkDoiExistsInDataCite("10.70102/mdc.jeopardy", "https://demorepo.stage.datacite.org/datasets/10.70102/mdc.jeopardy", "https://api.stage.datacite.org", client) + eventRequest := &EventRequest{ + Name: "view", + Pid: "10.70102/mdc.jeopardy", + // Set Url to a value that should fail. + // Since we are skipping the Url check, this test should pass i.e. the existence check should be successful. + Url: "https://demorepo.stage.datacite.org/datasets/10.70102/mdc.jeopardy.no.bueno", + } + err := eventService.Validate(eventRequest) - // Check if the error is nil if err != nil { - t.Errorf("checkDoiExistsInDataCite should return nil") + t.Errorf("Validate should return nil") } -} \ No newline at end of file +} + +func TestValidateSuccessWithOnlyDoiUrlCheck(t *testing.T) { + eventService := buildEventService("https://api.stage.datacite.org", false, true) + + eventRequest := &EventRequest{ + Name: "view", + Pid: "10.70102/mdc.jeopardy", + Url: "https://demorepo.stage.datacite.org/datasets/10.70102/mdc.jeopardy", + } + + err := eventService.Validate(eventRequest) + + if err != nil { + t.Errorf("Validate should return nil") + } +} + +func TestValidateSuccessWithNeitherDoiExistenceCheckAndDoiUrlCheck(t *testing.T) { + eventService := buildEventService("https://api.stage.datacite.org", false, false) + + eventRequest := &EventRequest{ + Name: "view", + Pid: "10.70102/mdc.jeopardy", + Url: "https://demorepo.stage.datacite.org/datasets/10.70102/mdc.jeopardy", + } + + err := eventService.Validate(eventRequest) + + if err != nil { + t.Errorf("Validate should return nil") + } +} + +func TestValidateSuccessWithEventRequestNameNotView(t *testing.T) { + eventService := buildEventService("https://api.stage.datacite.org", true, true) + + eventRequest := &EventRequest{ + Name: "not_view", + // Set PID to a value that would not resolve to a DOI. + Pid: "10.70102/mdc.jeopardy.no.bueno", + // Set URL to a value that would not match the PID. + Url: "https://demorepo.stage.datacite.org/datasets/10.70102/mdc.jeopardy.sin coincidencia", + } + + err := eventService.Validate(eventRequest) + + if err != nil { + t.Errorf("Validate should return nil") + } +} + +func TestValidateFailureWhenDoiDoesNotExist(t *testing.T) { + eventService := buildEventService("https://api.stage.datacite.org", true, true) + + eventRequest := &EventRequest{ + Name: "view", + Pid: "10.70102/mdc.jeopardy.no.bueno", + Url: "https://demorepo.stage.datacite.org/datasets/10.70102/mdc.jeopardy", + } + + err := eventService.Validate(eventRequest) + + const actualErr = "this DOI doesn't exist in DataCite" + + if err == nil { + t.Errorf("Validate should return an error") + } + + if err.Error() != actualErr { + t.Errorf("Validate should return error: %v", actualErr) + } +} + +func TestValidateFailureWhenDoiUrlCheckIsUnsuccessful(t *testing.T) { + eventService := buildEventService("https://api.stage.datacite.org", true, true) + + eventRequest := &EventRequest{ + Name: "view", + Pid: "10.70102/mdc.jeopardy", + Url: "https://demorepo.stage.datacite.org/datasets/10.70102/mdc.jeopardy.no.bueno", + } + + err := eventService.Validate(eventRequest) + + const actualErr = "this DOI doesn't match this URL" + + if err == nil { + t.Errorf("Validate should return an error") + } + + if err.Error() != actualErr { + t.Errorf("Validate should return error: %v", actualErr) + } +} + +func TestValidateFailureWhenCannotAccessDataCiteApi(t *testing.T) { + // We provide an incorrect DataCite URL in order to generate a failed response. + eventService := buildEventService("https://api.stage.datamight.com", true, true) + + eventRequest := &EventRequest{ + Name: "view", + Pid: "10.70102/mdc.jeopardy", + Url: "https://demorepo.stage.datacite.org/datasets/10.70102/mdc.jeopardy", + } + + err := eventService.Validate(eventRequest) + + fmt.Printf("err: %v\n", err) + + if err == nil { + t.Errorf("Validate should return an error") + } +} From 53967623dd1dbd99fb1b2dfe83e5d9060692893c Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Wed, 12 Jun 2024 16:28:21 +0200 Subject: [PATCH 2/4] initial work to split validation and add specs --- cmd/cli/main.go | 18 +++---- cmd/worker/main.go | 6 +-- internal/app/auth/auth.go | 2 +- internal/app/config.go | 19 ++++--- internal/app/db/db.go | 80 ++++++++++++++-------------- internal/app/net/http.go | 35 ++++++------ internal/app/reports/model.go | 52 +++++++++--------- internal/app/reports/service.go | 55 ++++++++++--------- internal/app/reports/service_test.go | 62 +++++++++++---------- internal/app/session/model.go | 8 +-- internal/app/session/repository.go | 6 +-- internal/app/session/service.go | 3 +- internal/app/session/service_test.go | 6 +-- internal/app/stats/model.go | 34 ++++++------ internal/app/stats/service.go | 3 +- internal/app/stats/service_test.go | 12 ++--- 16 files changed, 200 insertions(+), 201 deletions(-) diff --git a/cmd/cli/main.go b/cmd/cli/main.go index 3b4ea5f..ad2a74e 100644 --- a/cmd/cli/main.go +++ b/cmd/cli/main.go @@ -18,7 +18,7 @@ import ( ) func main() { - app := &cli.App{ + app := &cli.App{ Commands: []*cli.Command{ { Name: "event", @@ -95,9 +95,9 @@ func main() { } // Create shared data used for all datasets - sharedData := reports.SharedData { - Platform: platform, - Publisher: publisher, + sharedData := reports.SharedData{ + Platform: platform, + Publisher: publisher, PublisherId: publisherId, } @@ -155,11 +155,11 @@ func main() { }, }, }, - } + } - if err := app.Run(os.Args); err != nil { - log.Fatal(err) - } + if err := app.Run(os.Args); err != nil { + log.Fatal(err) + } } @@ -198,4 +198,4 @@ func migrateDB(conn *gorm.DB) { if err := db.AutoMigrate(conn); err != nil { log.Println(err) } -} \ No newline at end of file +} diff --git a/cmd/worker/main.go b/cmd/worker/main.go index cc96ab8..1b5b7ac 100644 --- a/cmd/worker/main.go +++ b/cmd/worker/main.go @@ -33,9 +33,9 @@ func report_job(repoId string, beginDate time.Time, endDate time.Time, platform reportsService := reports.NewReportsService(statsService) // Create shared data used for all datasets - sharedData := reports.SharedData { - Platform: platform, - Publisher: publisher, + sharedData := reports.SharedData{ + Platform: platform, + Publisher: publisher, PublisherId: publisherId, } diff --git a/internal/app/auth/auth.go b/internal/app/auth/auth.go index a38f7d7..93c7e5c 100644 --- a/internal/app/auth/auth.go +++ b/internal/app/auth/auth.go @@ -24,4 +24,4 @@ func GetAuthToken(config *app.Config) *jwtauth.JWTAuth { // Private key is nil because we are only using the public key to verify the token. return jwtauth.New("RS256", nil, publicKey) -} \ No newline at end of file +} diff --git a/internal/app/config.go b/internal/app/config.go index 9814e88..6ebc685 100644 --- a/internal/app/config.go +++ b/internal/app/config.go @@ -11,12 +11,12 @@ type Config struct { } AnalyticsDatabase struct { - Host string - Port string - User string - Dbname string + Host string + Port string + User string + Dbname string Password string - Sslmode string + Sslmode string } Plausible struct { @@ -24,11 +24,16 @@ type Config struct { } DataCite struct { - Url string - JWT string + Url string + JWT string JWTPublicKey string } + Validate struct { + DoiExistence bool + DoiUrl bool + } + ValidateDoi bool } diff --git a/internal/app/db/db.go b/internal/app/db/db.go index f478ce8..f57c75f 100644 --- a/internal/app/db/db.go +++ b/internal/app/db/db.go @@ -16,62 +16,62 @@ import ( // Format a clickhouse dsn from seperate config fields func CreateClickhouseDSN(host, port, user, password, dbname string) string { - return fmt.Sprintf("clickhouse://%s:%s@%s:%s/%s", user, password, host, port, dbname) + return fmt.Sprintf("clickhouse://%s:%s@%s:%s/%s", user, password, host, port, dbname) } func NewGormClickhouseConnection(dsn string) (*gorm.DB, error) { - // Setup a custom logger for gorm - newLogger := logger.New( - log.New(os.Stdout, "\r\n", log.LstdFlags), // io writer - logger.Config{ - SlowThreshold: time.Second, // Slow SQL threshold - LogLevel: logger.Error, // Log level - IgnoreRecordNotFoundError: true, // Ignore ErrRecordNotFound error for logger - Colorful: false, // Disable color - }, - ) + // Setup a custom logger for gorm + newLogger := logger.New( + log.New(os.Stdout, "\r\n", log.LstdFlags), // io writer + logger.Config{ + SlowThreshold: time.Second, // Slow SQL threshold + LogLevel: logger.Error, // Log level + IgnoreRecordNotFoundError: true, // Ignore ErrRecordNotFound error for logger + Colorful: false, // Disable color + }, + ) - // Open the connection with the custom logger - db, err := gorm.Open(clickhouse.Open(dsn), &gorm.Config{ - Logger: newLogger, - }) + // Open the connection with the custom logger + db, err := gorm.Open(clickhouse.Open(dsn), &gorm.Config{ + Logger: newLogger, + }) - if err != nil { - return db, err - } + if err != nil { + return db, err + } - // Gives us special access to do a common table expression with gorm i.e. "with" - db.Use(extraClausePlugin.New()) + // Gives us special access to do a common table expression with gorm i.e. "with" + db.Use(extraClausePlugin.New()) - return db, nil + return db, nil } // Test if the database connection is working func TestConnection(db *gorm.DB) error { - sqlDB, err := db.DB() - if err != nil { - return err - } - err = sqlDB.Ping() - if err != nil { - return err - } - return nil + sqlDB, err := db.DB() + if err != nil { + return err + } + err = sqlDB.Ping() + if err != nil { + return err + } + return nil } // Migrate models -func AutoMigrate(db *gorm.DB) error{ - var err error +func AutoMigrate(db *gorm.DB) error { + var err error - err = db.Set("gorm:table_options", event.TABLE_OPTIONS).AutoMigrate(&event.Event{}) + err = db.Set("gorm:table_options", event.TABLE_OPTIONS).AutoMigrate(&event.Event{}) - if err != nil { - return err - } + if err != nil { + return err + } - err = db.AutoMigrate ( - &session.Salt{}, + err = db.AutoMigrate( + &session.Salt{}, ) - return err -} \ No newline at end of file + return err +} diff --git a/internal/app/net/http.go b/internal/app/net/http.go index 5085670..0e74294 100644 --- a/internal/app/net/http.go +++ b/internal/app/net/http.go @@ -23,13 +23,13 @@ import ( ) type Http struct { - server *http.Server - router *chi.Mux - config *app.Config - db *gorm.DB + server *http.Server + router *chi.Mux + config *app.Config + db *gorm.DB tokenAuth *jwtauth.JWTAuth - eventServiceDB *event.EventService + eventServiceDB *event.EventService statsService *stats.StatsService } @@ -44,10 +44,10 @@ func NewHttpServer(config *app.Config, db *gorm.DB) *Http { // Create a new server that wraps the net/http server & add a router. s := &Http{ - server: &http.Server{}, - router: chi.NewRouter(), - config: config, - db: db, + server: &http.Server{}, + router: chi.NewRouter(), + config: config, + db: db, tokenAuth: tokenAuth, } @@ -145,7 +145,6 @@ func (s *Http) check(w http.ResponseWriter, r *http.Request) { w.Write([]byte(result.Timestamp.Format("2006-01-02T15:04:05Z"))) } - // Function to check if useragent is a bot func isBot(userAgent string) bool { // Read file with known bots @@ -162,7 +161,7 @@ func isBot(userAgent string) bool { // Loop through list of bots and check if useragent matches pattern for _, bot := range botsList { pattern := bot["pattern"].(string) - regex := regexp.MustCompile("(?i)"+pattern) + regex := regexp.MustCompile("(?i)" + pattern) if regex.MatchString(userAgent) { return true @@ -247,8 +246,8 @@ func (s *Http) getAggregate(w http.ResponseWriter, r *http.Request) { } query := stats.Query{ - Start: startDate, - End: endDate, + Start: startDate, + End: endDate, } // Get total views for a repository in query period @@ -279,8 +278,8 @@ func (s *Http) getTimeseries(w http.ResponseWriter, r *http.Request) { } query := stats.Query{ - Start: startDate, - End: endDate, + Start: startDate, + End: endDate, Interval: interval, } @@ -321,8 +320,8 @@ func (s *Http) getBreakdown(w http.ResponseWriter, r *http.Request) { } query := stats.Query{ - Start: startDate, - End: endDate, + Start: startDate, + End: endDate, } // Get total views for a repository based on query @@ -337,4 +336,4 @@ func (s *Http) getBreakdown(w http.ResponseWriter, r *http.Request) { // Serialise results but put inside a json object json.NewEncoder(w).Encode(data) -} \ No newline at end of file +} diff --git a/internal/app/reports/model.go b/internal/app/reports/model.go index 805ea14..f6cae97 100644 --- a/internal/app/reports/model.go +++ b/internal/app/reports/model.go @@ -8,54 +8,54 @@ type ReportingPeriod struct { } type Exception struct { - Code int `json:"code"` + Code int `json:"code"` Severity string `json:"severity"` - Message string `json:"message"` - HelpUrl string `json:"help-url"` - Data string `json:"data"` + Message string `json:"message"` + HelpUrl string `json:"help-url"` + Data string `json:"data"` } type CounterIdentifier struct { - Type string `json:"type"` + Type string `json:"type"` Value string `json:"value"` } type CounterDatasetInstance struct { - MetricType string `json:"metric-type"` - Count int `json:"count"` + MetricType string `json:"metric-type"` + Count int `json:"count"` AccessMethod string `json:"access-method"` } type CounterDatasetPerformance struct { Instance []CounterDatasetInstance `json:"instance"` - Period ReportingPeriod `json:"period"` + Period ReportingPeriod `json:"period"` } // SUSHI report header struct type ReportHeader struct { - ReportName string `json:"report-name"` - ReportId string `json:"report-id"` - Release string `json:"release"` - Created string `json:"created"` - CreatedBy string `json:"created-by"` - ReportingPeriod ReportingPeriod `json:"reporting-period"` - ReportFilters []string `json:"report-filters"` - ReportAttributes []string `json:"report-attributes"` - Exceptions []Exception `json:"exceptions"` + ReportName string `json:"report-name"` + ReportId string `json:"report-id"` + Release string `json:"release"` + Created string `json:"created"` + CreatedBy string `json:"created-by"` + ReportingPeriod ReportingPeriod `json:"reporting-period"` + ReportFilters []string `json:"report-filters"` + ReportAttributes []string `json:"report-attributes"` + Exceptions []Exception `json:"exceptions"` } // COUNTER report dataset usage struct type CounterDatasetUsage struct { - DatasetTitle string `json:"dataset-title"` - DatasetId []CounterIdentifier `json:"dataset-id"` - Platform string `json:"platform"` - Publisher string `json:"publisher"` - PublisherId []CounterIdentifier `json:"publisher-id"` - DataType string `json:"data-type"` - Performance []CounterDatasetPerformance `json:"performance"` + DatasetTitle string `json:"dataset-title"` + DatasetId []CounterIdentifier `json:"dataset-id"` + Platform string `json:"platform"` + Publisher string `json:"publisher"` + PublisherId []CounterIdentifier `json:"publisher-id"` + DataType string `json:"data-type"` + Performance []CounterDatasetPerformance `json:"performance"` } type CounterDatasetReport struct { - ReportHeader ReportHeader `json:"report-header"` + ReportHeader ReportHeader `json:"report-header"` ReportDatasets []CounterDatasetUsage `json:"report-datasets"` -} \ No newline at end of file +} diff --git a/internal/app/reports/service.go b/internal/app/reports/service.go index b2f8dd6..d039c8b 100644 --- a/internal/app/reports/service.go +++ b/internal/app/reports/service.go @@ -117,11 +117,11 @@ func (service *ReportsService) GenerateDatasetUsageReport(repoId string, startDa if addCompressedHeader { // Add exception that this will be compressed report exceptions = append(exceptions, Exception{ - Code: 69, - Message: "Report is compressed using gzip", - Severity: "warning", - HelpUrl: "https://github.com/datacite/sashimi", - Data: "usage data needs to be uncompressed", + Code: 69, + Message: "Report is compressed using gzip", + Severity: "warning", + HelpUrl: "https://github.com/datacite/sashimi", + Data: "usage data needs to be uncompressed", }) } @@ -180,7 +180,7 @@ func generateDatasetUsage(beginDate time.Time, endDate time.Time, result stats.B datasetUsage.Publisher = sharedData.Publisher if sharedData.PublisherId != "" { - datasetUsage.PublisherId = []CounterIdentifier{ { + datasetUsage.PublisherId = []CounterIdentifier{{ Type: "client-id", Value: sharedData.PublisherId, }} @@ -197,23 +197,23 @@ func generateDatasetUsage(beginDate time.Time, endDate time.Time, result stats.B }, Instance: []CounterDatasetInstance{ { - MetricType: "total-dataset-requests", - Count: int(result.TotalDownloads), + MetricType: "total-dataset-requests", + Count: int(result.TotalDownloads), AccessMethod: "regular", }, { - MetricType: "unique-dataset-requests", - Count: int(result.UniqueDownloads), + MetricType: "unique-dataset-requests", + Count: int(result.UniqueDownloads), AccessMethod: "regular", }, { - MetricType: "total-dataset-investigations", - Count: int(result.TotalViews), + MetricType: "total-dataset-investigations", + Count: int(result.TotalViews), AccessMethod: "regular", }, { - MetricType: "unique-dataset-investigations", - Count: int(result.UniqueViews), + MetricType: "unique-dataset-investigations", + Count: int(result.UniqueViews), AccessMethod: "regular", }, }, @@ -237,7 +237,7 @@ func SendReportToAPI(reportsAPIEndpoint string, compressedJson []byte, jwt strin req.Header.Set("Content-Encoding", "gzip") // Add JWT token to request - req.Header.Set("Authorization", "Bearer " + jwt) + req.Header.Set("Authorization", "Bearer "+jwt) client := http.Client{ Timeout: 100 * time.Second, @@ -253,20 +253,19 @@ func SendReportToAPI(reportsAPIEndpoint string, compressedJson []byte, jwt strin // Check response code switch res.StatusCode { - case http.StatusCreated: - log.Default().Println("Report sent to Reports API") - case http.StatusUnauthorized: - return errors.New("unauthorized, JWT token is missing or invalid") - case http.StatusForbidden: - return errors.New("forbidden, JWT is expired or invalid") - case http.StatusUnsupportedMediaType: - return errors.New("did not include correct Content-Type header") - case http.StatusUnprocessableEntity: - return errors.New("invalid report provided") - default: - return errors.New("Error sending report to Reports API: " + res.Status) + case http.StatusCreated: + log.Default().Println("Report sent to Reports API") + case http.StatusUnauthorized: + return errors.New("unauthorized, JWT token is missing or invalid") + case http.StatusForbidden: + return errors.New("forbidden, JWT is expired or invalid") + case http.StatusUnsupportedMediaType: + return errors.New("did not include correct Content-Type header") + case http.StatusUnprocessableEntity: + return errors.New("invalid report provided") + default: + return errors.New("Error sending report to Reports API: " + res.Status) } return nil } - diff --git a/internal/app/reports/service_test.go b/internal/app/reports/service_test.go index d91c2d2..a1ced86 100644 --- a/internal/app/reports/service_test.go +++ b/internal/app/reports/service_test.go @@ -9,11 +9,9 @@ import ( ) type MockReportsRepositoryReader struct { - } type MockStatsService struct { - } // Mock breakdown @@ -22,34 +20,34 @@ func (m *MockStatsService) BreakdownByPID(repoId string, query stats.Query, page if page == 1 { return []stats.BreakdownResult{ { - Pid: "10.1234/1", - TotalViews: 100, - UniqueViews: 50, - TotalDownloads: 50, + Pid: "10.1234/1", + TotalViews: 100, + UniqueViews: 50, + TotalDownloads: 50, UniqueDownloads: 25, }, { - Pid: "10.1234/2", - TotalViews: 100, - UniqueViews: 50, - TotalDownloads: 50, + Pid: "10.1234/2", + TotalViews: 100, + UniqueViews: 50, + TotalDownloads: 50, UniqueDownloads: 25, }, } } else if page == 2 { return []stats.BreakdownResult{ { - Pid: "10.1234/3", - TotalViews: 100, - UniqueViews: 50, - TotalDownloads: 50, + Pid: "10.1234/3", + TotalViews: 100, + UniqueViews: 50, + TotalDownloads: 50, UniqueDownloads: 25, }, { - Pid: "10.1234/4", - TotalViews: 100, - UniqueViews: 50, - TotalDownloads: 50, + Pid: "10.1234/4", + TotalViews: 100, + UniqueViews: 50, + TotalDownloads: 50, UniqueDownloads: 25, }, } @@ -67,17 +65,17 @@ func (m *MockStatsService) CountUniquePID(repoId string, query stats.Query) int6 func (m *MockStatsService) Timeseries(repoId string, query stats.Query) []stats.TimeseriesResult { return []stats.TimeseriesResult{ { - Date: time.Date(2018, 1, 1, 0, 0, 0, 0, time.UTC), - TotalViews: 100, - UniqueViews: 50, - TotalDownloads: 50, + Date: time.Date(2018, 1, 1, 0, 0, 0, 0, time.UTC), + TotalViews: 100, + UniqueViews: 50, + TotalDownloads: 50, UniqueDownloads: 25, }, { - Date: time.Date(2018, 1, 2, 0, 0, 0, 0, time.UTC), - TotalViews: 200, - UniqueViews: 100, - TotalDownloads: 100, + Date: time.Date(2018, 1, 2, 0, 0, 0, 0, time.UTC), + TotalViews: 200, + UniqueViews: 100, + TotalDownloads: 100, UniqueDownloads: 50, }, } @@ -86,9 +84,9 @@ func (m *MockStatsService) Timeseries(repoId string, query stats.Query) []stats. // Mock aggregate func (m *MockStatsService) Aggregate(repoId string, query stats.Query) stats.AggregateResult { return stats.AggregateResult{ - TotalViews: 100, - UniqueViews: 50, - TotalDownloads: 50, + TotalViews: 100, + UniqueViews: 50, + TotalDownloads: 50, UniqueDownloads: 25, } } @@ -112,9 +110,9 @@ func TestGenerateDatasetUsageReport(t *testing.T) { endDate := time.Date(2018, 12, 31, 0, 0, 0, 0, time.UTC) // Create fake shared data - sharedData := SharedData { - Platform: "datacite", - Publisher: "datacite", + sharedData := SharedData{ + Platform: "datacite", + Publisher: "datacite", PublisherId: "datacite.test", } diff --git a/internal/app/session/model.go b/internal/app/session/model.go index eee79bc..8cc21f7 100644 --- a/internal/app/session/model.go +++ b/internal/app/session/model.go @@ -5,7 +5,7 @@ import ( ) type Salt struct { - ID uint `gorm:"primary key;autoIncrement"` - Salt []byte - Created time.Time -} \ No newline at end of file + ID uint `gorm:"primary key;autoIncrement"` + Salt []byte + Created time.Time +} diff --git a/internal/app/session/repository.go b/internal/app/session/repository.go index 74aa515..94bf687 100644 --- a/internal/app/session/repository.go +++ b/internal/app/session/repository.go @@ -11,13 +11,13 @@ type SessionRepositoryReader interface { } type SessionRepository struct { - db *gorm.DB - config *app.Config + db *gorm.DB + config *app.Config } func NewSessionRepository(db *gorm.DB, config *app.Config) *SessionRepository { return &SessionRepository{ - db: db, + db: db, config: config, } } diff --git a/internal/app/session/service.go b/internal/app/session/service.go index 5f69040..e051ff2 100644 --- a/internal/app/session/service.go +++ b/internal/app/session/service.go @@ -48,7 +48,7 @@ func generateSalt() (Salt, error) { // Create the salt salt := Salt{ - Salt: saltBytes, + Salt: saltBytes, Created: time.Now(), } @@ -103,7 +103,6 @@ func GenerateSessionId(user_id uint64, time time.Time) uint64 { return h.Sum64() } - func GenerateUserId(salt *Salt, client_ip string, user_agent string, repo_id string, host_domain string) uint64 { // Build a salted integer user id // User_id is based upon a daily salt, the ip from the client, diff --git a/internal/app/session/service_test.go b/internal/app/session/service_test.go index 29dbd81..ed752c8 100644 --- a/internal/app/session/service_test.go +++ b/internal/app/session/service_test.go @@ -7,8 +7,8 @@ import ( func TestGenerateUserId(t *testing.T) { // Create fake salt - salt := Salt { - Salt: []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16}, + salt := Salt{ + Salt: []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16}, Created: time.Now(), } @@ -42,4 +42,4 @@ func TestGenerateSessionId(t *testing.T) { if sessionId != expected { t.Fatalf(`Session id is not %d`, expected) } -} \ No newline at end of file +} diff --git a/internal/app/stats/model.go b/internal/app/stats/model.go index 7c76033..493a163 100644 --- a/internal/app/stats/model.go +++ b/internal/app/stats/model.go @@ -3,30 +3,30 @@ package stats import "time" type AggregateResult struct { - TotalViews int64 `json:"total_views"` - UniqueViews int64 `json:"unique_views"` - TotalDownloads int64 `json:"total_downloads"` + TotalViews int64 `json:"total_views"` + UniqueViews int64 `json:"unique_views"` + TotalDownloads int64 `json:"total_downloads"` UniqueDownloads int64 `json:"unique_downloads"` } type TimeseriesResult struct { - Date time.Time `json:"date"` - TotalViews int64 `json:"total_views"` - UniqueViews int64 `json:"unique_views"` - TotalDownloads int64 `json:"total_downloads"` - UniqueDownloads int64 `json:"unique_downloads"` + Date time.Time `json:"date"` + TotalViews int64 `json:"total_views"` + UniqueViews int64 `json:"unique_views"` + TotalDownloads int64 `json:"total_downloads"` + UniqueDownloads int64 `json:"unique_downloads"` } type BreakdownResult struct { - Pid string `json:"pid"` - TotalViews int64 `json:"total_views"` - UniqueViews int64 `json:"unique_views"` - TotalDownloads int64 `json:"total_downloads"` - UniqueDownloads int64 `json:"unique_downloads"` + Pid string `json:"pid"` + TotalViews int64 `json:"total_views"` + UniqueViews int64 `json:"unique_views"` + TotalDownloads int64 `json:"total_downloads"` + UniqueDownloads int64 `json:"unique_downloads"` } type Query struct { - Start time.Time // Beginning of the query period - End time.Time // End of the query period - Interval string // Interval to break the results into e.g. "day", "month", "hour" -} \ No newline at end of file + Start time.Time // Beginning of the query period + End time.Time // End of the query period + Interval string // Interval to break the results into e.g. "day", "month", "hour" +} diff --git a/internal/app/stats/service.go b/internal/app/stats/service.go index c235333..8e3cd83 100644 --- a/internal/app/stats/service.go +++ b/internal/app/stats/service.go @@ -47,7 +47,6 @@ func (service *StatsService) LastEvent(repoId string) (event.Event, bool) { return service.repository.LastEvent(repoId) } - // Function to parse a period string into start and end time ranges relative to date func ParsePeriodString(period string, date string) (time.Time, time.Time, error) { // Set default start and end times @@ -106,4 +105,4 @@ func ParsePeriodString(period string, date string) (time.Time, time.Time, error) endTime = endTime.AddDate(0, 0, 1) return startTime, endTime, nil -} \ No newline at end of file +} diff --git a/internal/app/stats/service_test.go b/internal/app/stats/service_test.go index da00730..c14ce13 100644 --- a/internal/app/stats/service_test.go +++ b/internal/app/stats/service_test.go @@ -144,8 +144,8 @@ func TestStatsService_Aggregate(t *testing.T) { // Construct query query := Query{ - Start: start, - End: end, + Start: start, + End: end, } statsRepository := NewStatsRepository(conn) @@ -250,8 +250,8 @@ func TestStatsService_BreakdownByPID(t *testing.T) { // Construct query for timeseries by hour query := Query{ - Start: start, - End: end, + Start: start, + End: end, } // Get stats @@ -285,8 +285,8 @@ func TestStatsService_CountUniquePID(t *testing.T) { // Construct query for timeseries by hour query := Query{ - Start: start, - End: end, + Start: start, + End: end, } // Get stats From 0cc78b85f11d4c78e8e86f0c729969e52847150d Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Wed, 19 Jun 2024 17:18:15 +0200 Subject: [PATCH 3/4] rework --- cmd/cli/main.go | 3 ++- internal/app/config.go | 8 ++++---- internal/app/event/service.go | 14 +++++++------- internal/app/stats/service_test.go | 18 ++++++++++++------ readme.md | 8 +++++++- 5 files changed, 32 insertions(+), 19 deletions(-) diff --git a/cmd/cli/main.go b/cmd/cli/main.go index ad2a74e..c3b5500 100644 --- a/cmd/cli/main.go +++ b/cmd/cli/main.go @@ -32,7 +32,8 @@ func main() { // Get configuration from environment variables. var config = app.GetConfigFromEnv() - config.ValidateDoi = false + config.Validate.DoiExistence = false + config.Validate.DoiUrl = false // Setup database connection conn := createDB(config) diff --git a/internal/app/config.go b/internal/app/config.go index 6ebc685..28591f9 100644 --- a/internal/app/config.go +++ b/internal/app/config.go @@ -2,6 +2,7 @@ package app import ( "os" + "strconv" "strings" ) @@ -33,8 +34,6 @@ type Config struct { DoiExistence bool DoiUrl bool } - - ValidateDoi bool } func getEnv(key, fallback string) string { @@ -54,14 +53,15 @@ func GetConfigFromEnv() *Config { config.DataCite.JWTPublicKey = strings.Replace(getEnv("JWT_PUBLIC_KEY", ""), `\n`, "\n", -1) // Database config - config.AnalyticsDatabase.Host = getEnv("ANALYTICS_DATABASE_HOST", "localhost") + config.AnalyticsDatabase.Host = getEnv("ANALYTICS_DATABASE_HOST", "wendels-mbp.lan") config.AnalyticsDatabase.Port = getEnv("ANALYTICS_DATABASE_PORT", "9000") config.AnalyticsDatabase.User = getEnv("ANALYTICS_DATABASE_USER", "keeshond") config.AnalyticsDatabase.Dbname = getEnv("ANALYTICS_DATABASE_DBNAME", "keeshond") config.AnalyticsDatabase.Password = getEnv("ANALYTICS_DATABASE_PASSWORD", "keeshond") // Validate DOI - config.ValidateDoi = getEnv("VALIDATE_DOI", "true") == "true" + config.Validate.DoiExistence, _ = strconv.ParseBool(getEnv("VALIDATE_DOI_EXISTENCE", "true")) + config.Validate.DoiUrl, _ = strconv.ParseBool(getEnv("VALIDATE_DOI_URL", "false")) return &config } diff --git a/internal/app/event/service.go b/internal/app/event/service.go index a0114fd..7f9e2a0 100644 --- a/internal/app/event/service.go +++ b/internal/app/event/service.go @@ -96,11 +96,7 @@ func (service *EventService) CreateRaw(event Event) (Event, error) { func (service *EventService) Validate(eventRequest *EventRequest) error { var err error - if eventRequest.Name != "view" { - return err - } - - if !service.shouldValidate() { + if !shouldValidate(service, eventRequest) { return err } @@ -127,8 +123,12 @@ func (service *EventService) Validate(eventRequest *EventRequest) error { return err } -func (service *EventService) shouldValidate() bool { - return service.config.Validate.DoiExistence && service.config.Validate.DoiUrl +func shouldValidate(service *EventService, eventRequest *EventRequest) bool { + if eventRequest.Name != "view" { + return false + } + + return service.config.Validate.DoiExistence || service.config.Validate.DoiUrl } func getDoi(doi string, dataciteApiUrl string) (*http.Response, error) { diff --git a/internal/app/stats/service_test.go b/internal/app/stats/service_test.go index c14ce13..2369755 100644 --- a/internal/app/stats/service_test.go +++ b/internal/app/stats/service_test.go @@ -75,7 +75,8 @@ func setupTestDB(config *app.Config) (*gorm.DB, error) { func setup() TestState { // Test config config := app.GetConfigFromEnv() - config.ValidateDoi = false + config.Validate.DoiExistence = false + config.Validate.DoiUrl = false config.AnalyticsDatabase.Dbname = "keeshond_test" conn, err := setupTestDB(config) @@ -127,7 +128,8 @@ func TestMain(m *testing.M) { func TestStatsService_Aggregate(t *testing.T) { // Test config config := app.GetConfigFromEnv() - config.ValidateDoi = false + config.Validate.DoiExistence = false + config.Validate.DoiUrl = false config.AnalyticsDatabase.Dbname = "keeshond_test" conn, err := setupTestDB(config) @@ -174,7 +176,8 @@ func TestStatsService_Aggregate(t *testing.T) { func TestStatsService_Timeseries(t *testing.T) { // Test config config := app.GetConfigFromEnv() - config.ValidateDoi = false + config.Validate.DoiExistence = false + config.Validate.DoiUrl = false config.AnalyticsDatabase.Dbname = "keeshond_test" conn, err := setupTestDB(config) @@ -230,7 +233,8 @@ func TestStatsService_Timeseries(t *testing.T) { func TestStatsService_BreakdownByPID(t *testing.T) { // Test config config := app.GetConfigFromEnv() - config.ValidateDoi = false + config.Validate.DoiExistence = false + config.Validate.DoiUrl = false config.AnalyticsDatabase.Dbname = "keeshond_test" conn, err := setupTestDB(config) @@ -265,7 +269,8 @@ func TestStatsService_BreakdownByPID(t *testing.T) { func TestStatsService_CountUniquePID(t *testing.T) { // Test config config := app.GetConfigFromEnv() - config.ValidateDoi = false + config.Validate.DoiExistence = false + config.Validate.DoiUrl = false config.AnalyticsDatabase.Dbname = "keeshond_test" conn, err := setupTestDB(config) @@ -300,7 +305,8 @@ func TestStatsService_CountUniquePID(t *testing.T) { func TestStatsService_LastEvent(t *testing.T) { // Test config config := app.GetConfigFromEnv() - config.ValidateDoi = false + config.Validate.DoiExistence = false + config.Validate.DoiUrl = false config.AnalyticsDatabase.Dbname = "keeshond_test" conn, err := setupTestDB(config) diff --git a/readme.md b/readme.md index 6b383a2..2d210a5 100644 --- a/readme.md +++ b/readme.md @@ -39,7 +39,8 @@ Configuration is taken from the environment ### Web tracking Config -- VALIDATE_DOI - Can enable/disable DOI validation for event tracking - default to true. +- VALIDATE_DOI_EXISTENCE - Can enable/disable DOI existence validation for event tracking - default to true. +- VALIDATE_DOI_URL - Can enable/disable DOI URL validation for event tracking - default to false. - DATACITE_API_URL - This is used only when storing events as part of DOI validation - JWT_PUBLIC_KEY - This is used on authenticated endpoints to validate valid DataCite JWTs @@ -99,4 +100,9 @@ docker build -f ./docker/worker/Dockerfile -t keeshondworker . # Run docker with env vars docker run --network="host" --env REPO_ID=datacite.demo --env BEGIN_DATE=2022-01-01 --env END_DATE=2022-12-31 --env PLATFORM=datacite --env PUBLISHER="datacite demo" --env PUBLISHER_ID=datacite.demo keeshondworker +``` + +``` +# Connect to the local docker Clickhouse database container +clickhouse client --user=keeshond --password=keeshond ``` \ No newline at end of file From 492d7699e497cb811a795a5b93c5712683e52256 Mon Sep 17 00:00:00 2001 From: Wendel Fabian Chinsamy Date: Wed, 19 Jun 2024 17:39:42 +0200 Subject: [PATCH 4/4] remove config setting --- internal/app/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/app/config.go b/internal/app/config.go index 28591f9..b2ec489 100644 --- a/internal/app/config.go +++ b/internal/app/config.go @@ -53,7 +53,7 @@ func GetConfigFromEnv() *Config { config.DataCite.JWTPublicKey = strings.Replace(getEnv("JWT_PUBLIC_KEY", ""), `\n`, "\n", -1) // Database config - config.AnalyticsDatabase.Host = getEnv("ANALYTICS_DATABASE_HOST", "wendels-mbp.lan") + config.AnalyticsDatabase.Host = getEnv("ANALYTICS_DATABASE_HOST", "localhost") config.AnalyticsDatabase.Port = getEnv("ANALYTICS_DATABASE_PORT", "9000") config.AnalyticsDatabase.User = getEnv("ANALYTICS_DATABASE_USER", "keeshond") config.AnalyticsDatabase.Dbname = getEnv("ANALYTICS_DATABASE_DBNAME", "keeshond")