From b63a05ecd4ae0629824493c17371198da5a033b8 Mon Sep 17 00:00:00 2001 From: Adrian Sieber Date: Mon, 8 Jan 2024 11:12:12 +0000 Subject: [PATCH] Post all repos of a crawled batch to Airsequel at once --- app/Airsequel.hs | 206 ++++++++++++++++++++--------------------------- app/Main.hs | 37 +++++---- app/Types.hs | 97 +++++++++++++--------- 3 files changed, 168 insertions(+), 172 deletions(-) diff --git a/app/Airsequel.hs b/app/Airsequel.hs index cf730d0..58fc07c 100644 --- a/app/Airsequel.hs +++ b/app/Airsequel.hs @@ -5,17 +5,20 @@ {-# HLINT ignore "Use maybe" #-} {-# HLINT ignore "Use unless" #-} +{-# HLINT ignore "Use fmap" #-} module Airsequel where import Protolude ( Either (..), IO, - Int, + Integer, Maybe (..), Text, encodeUtf8, + filter, fromMaybe, + isJust, print, pure, putErrLn, @@ -64,6 +67,7 @@ import Network.HTTP.Client.TLS (tlsManagerSettings) import Network.HTTP.Types (statusCode) import Text.RawString.QQ (r) +import Numeric (showInt) import Types (GqlRes (..), Repo (..), SaveStrategy (..)) import Utils (loadAirsWriteToken, loadDbEndpoint) @@ -90,39 +94,9 @@ setRequestFields airseqWriteToken query variables req = upsertRepoQuery :: Text upsertRepoQuery = do [r| - mutation InsertRepo ( - $rowid: Int - $github_id: Int! - $owner: String! - $name: String! - $description: String - $homepage: String - $language: String - $stargazers_count: Int - $open_issues_count: Int - $commits_count: Int - $is_archived: Boolean - $created_utc: String - $updated_utc: String - $crawled_utc: String! - ) { + mutation InsertRepo( $objects: [repos_insert_input!]! ) { insert_repos( - objects: [{ - rowid: $rowid - github_id: $github_id - owner: $owner - name: $name - description: $description - homepage: $homepage - language: $language - stargazers_count: $stargazers_count - open_issues_count: $open_issues_count - commits_count: $commits_count - is_archived: $is_archived - created_utc: $created_utc - updated_utc: $updated_utc - crawled_utc: $crawled_utc - }] + objects: $objects on_conflict: { constraint: rowid update_columns: [ @@ -149,81 +123,82 @@ upsertRepoQuery = do |] --- | Get rowid of a repo with the specified GitHub ID -getRowid :: Manager -> Text -> Text -> Repo -> IO (Maybe Int) -getRowid manager dbEndpoint airseqWriteToken repo = do +-- | Load Airsequel rowid of repos by their GitHub ID +loadRowids :: Manager -> Text -> Text -> [Repo] -> IO [Repo] +loadRowids manager dbEndpoint airseqWriteToken repos = do let - githubId = repo.githubId - - getRowidQuery :: Text - getRowidQuery = + getReposWithRowidQuery :: Text + getReposWithRowidQuery = [r| - query GetRowid($github_id: Int!) { - repos( - filter: { github_id: { eq: $github_id } } - ) { + query GetRowids ($githubIds: [Int]) { + repos(filter: { github_id: { in: $githubIds } }) { + databaseId: github_id rowid } } |] - initialGetRowidRequest <- parseRequest $ T.unpack dbEndpoint + initialGetRowidsRequest <- parseRequest $ T.unpack dbEndpoint - let getRowidRequest = + let getRowidsRequest = setRequestFields airseqWriteToken - getRowidQuery - (KeyMap.fromList ["github_id" .= githubId]) - initialGetRowidRequest - getRowidResponse <- httpLbs getRowidRequest manager + getReposWithRowidQuery + (KeyMap.fromList ["githubIds" .= (repos <&> githubId)]) + initialGetRowidsRequest + + getReposWithRowidResponse <- httpLbs getRowidsRequest manager when - (getRowidResponse.responseStatus.statusCode /= 200) - (putErrText $ show getRowidResponse.responseBody) + (getReposWithRowidResponse.responseStatus.statusCode /= 200) + (putErrText $ show getReposWithRowidResponse.responseBody) let - repoSlug = repo.owner <> "/" <> repo.name - msgBase = - "Repo \"" <> repoSlug <> "\" is not" - - rowidResult :: Either [P.Char] Int = - ( getRowidResponse.responseBody - & eitherDecode - :: Either [P.Char] Object - ) - >>= ( \gqlRes -> - P.flip parseEither gqlRes $ \gqlResObj -> do - gqlData <- gqlResObj .: "data" - gqlData .: "repos" - ) - >>= ( \case - [] -> Left $ T.unpack $ msgBase <> " in Airsequel yet" - [repoObj :: Object] -> parseEither (.: "rowid") repoObj - _ -> - Left $ - T.unpack $ - "Error: " <> msgBase <> " unique in Airsequel" - ) - - case rowidResult of + ghIdsWithRowid + :: Either [P.Char] [(Integer {- githubId -}, Integer {- rowid -})] = + (getReposWithRowidResponse.responseBody & eitherDecode) + >>= ( \gqlRes -> + P.flip parseEither gqlRes $ \gqlResObj -> do + gqlData <- gqlResObj .: "data" + gqlData .: "repos" + ) + <&> ( \(reposWithRowid :: [Repo]) -> + reposWithRowid + & filter (\repoWithRowid -> isJust repoWithRowid.rowid) + <&> ( \repoWithRowid -> + ( repoWithRowid.githubId + , repoWithRowid.rowid & fromMaybe 0 + ) + ) + ) + + case ghIdsWithRowid of Left err -> do putErrLn err - pure Nothing - Right rowid -> do - P.putText $ - "Repo \"" - <> repoSlug - <> "\" is already in Airsequel " - <> ("(rowid " <> show rowid <> ") ") - <> "and will be updated." - pure $ Just rowid - - -{-| Insert or upsert the repo in Airsequel + pure repos + Right rowids -> do + P.putStrLn $ + showInt (P.length rowids) " of " + <> showInt (P.length repos) " repos already exist in Airsequel" + + pure $ + repos <&> \repo -> + repo + { rowid = + rowids + & filter (\(ghId, _) -> ghId == repo.githubId) + & P.head + <&> P.snd + } + + +{-| Insert or upsert repos in Airsequel via a POST request executed by http-client -} -saveRepoInAirsequel :: SaveStrategy -> Repo -> IO () -saveRepoInAirsequel saveStrategy repo = do +saveReposInAirsequel :: SaveStrategy -> [Repo] -> IO () +saveReposInAirsequel saveStrategy repos = do + P.putText $ "⏳ Saving " <> show (P.length repos) <> " repos in Airsequel …" + dbEndpoint <- loadDbEndpoint airseqWriteToken <- loadAirsWriteToken @@ -231,43 +206,38 @@ saveRepoInAirsequel saveStrategy repo = do now <- getCurrentTime <&> (iso8601Show >>> T.pack) - -- Get rowid for the repo to execute an upsert - -- if the save strategy is to overwrite - rowidMb <- - if saveStrategy == OverwriteRepo - then - getRowid - manager - dbEndpoint - airseqWriteToken - repo - else pure Nothing + -- Get rowid for repos if repos shall be overwritten + reposNorm <- case saveStrategy of + OverwriteRepo -> loadRowids manager dbEndpoint airseqWriteToken repos + AddRepo -> pure repos initialInsertRequest <- parseRequest $ T.unpack dbEndpoint let - variables = - [ "rowid" .= rowidMb - , "github_id" .= repo.githubId - , "owner" .= repo.owner - , "name" .= repo.name - , "description" .= repo.description - , "homepage" .= repo.homepageUrl - , "language" .= repo.primaryLanguage - , "stargazers_count" .= repo.stargazerCount - , "open_issues_count" .= repo.openIssuesCount - , "commits_count" .= (repo.commitsCount & fromMaybe 0) - , "is_archived" .= repo.isArchived - , "created_utc" .= iso8601Show repo.createdAt - , "updated_utc" .= iso8601Show repo.updatedAt - , "crawled_utc" .= now - ] + objects = + reposNorm <&> \repo -> + object + [ "rowid" .= repo.rowid + , "github_id" .= repo.githubId + , "owner" .= repo.owner + , "name" .= repo.name + , "description" .= repo.description + , "homepage" .= repo.homepageUrl + , "language" .= repo.primaryLanguage + , "stargazers_count" .= repo.stargazerCount + , "open_issues_count" .= repo.openIssuesCount + , "commits_count" .= (repo.commitsCount & fromMaybe 0) + , "is_archived" .= repo.isArchived + , "created_utc" .= (repo.createdAt <&> iso8601Show) + , "updated_utc" .= (repo.updatedAt <&> iso8601Show) + , "crawled_utc" .= now + ] insertRequest = setRequestFields airseqWriteToken upsertRepoQuery - (KeyMap.fromList variables) + (KeyMap.fromList ["objects" .= objects]) initialInsertRequest insertResponse <- httpLbs insertRequest manager diff --git a/app/Main.hs b/app/Main.hs index 0eda5ed..e64fe83 100644 --- a/app/Main.hs +++ b/app/Main.hs @@ -23,6 +23,7 @@ import Protolude ( mempty, pure, putErrText, + putStrLn, putText, show, when, @@ -75,7 +76,8 @@ import Options.Applicative ( ) import Text.RawString.QQ (r) -import Airsequel (saveRepoInAirsequel) +import Airsequel (saveReposInAirsequel) +import Numeric (showInt) import Options.Applicative.Help.Pretty (vsep) import Types (GqlRepoRes (..), Repo (..), SaveStrategy (..)) import Utils (loadGitHubToken) @@ -127,9 +129,14 @@ commands = do formatRepo :: Repo -> Text -formatRepo repo = +formatRepo repo = do + let repoSlug = + (repo.owner & fromMaybe "") + <> "/" + <> (repo.name & fromMaybe "") + "\n\n" - <> ("repo_url: github.com/" <> repo.owner <> "/" <> repo.name <> "\n") + <> ("repo_url: github.com/" <> repoSlug <> "\n") <> ("description: " <> (repo.description & fromMaybe "") <> "\n") <> ("homepage: " <> (repo.homepageUrl & fromMaybe "") <> "\n") <> ("language: " <> (repo.primaryLanguage & fromMaybe "") <> "\n") @@ -221,6 +228,8 @@ getGhHeaders tokenMb = execGithubGqlQuery :: Maybe Text -> Text -> KeyMap Value -> [Repo] -> IO [Repo] execGithubGqlQuery ghTokenMb query variables initialRepos = do + putText "\n▶️ Query a batch of repos from GitHub …" + manager <- newManager tlsManagerSettings initialRequest <- parseRequest $ T.unpack "https://api.github.com/graphql" @@ -254,16 +263,16 @@ execGithubGqlQuery ghTokenMb query variables initialRepos = do let repos :: [Repo] = gqlResponse.repos - putText $ + putStrLn $ "✅ Received " - <> show (P.length repos) - <> " repos from GitHub" + <> showInt (P.length repos) " repos " + <> "from GitHub" repos <&> ( \repo -> - repo.owner + (repo.owner & fromMaybe "") <> ("/" :: Text) - <> repo.name + <> (repo.name & fromMaybe "") <> (" | stars: " :: Text) <> show repo.stargazerCount <> (" | commits: " :: Text) @@ -275,13 +284,7 @@ execGithubGqlQuery ghTokenMb query variables initialRepos = do & mapM_ putText when (P.not $ P.null repos) $ do - putText $ - "⏳ Save " - <> show (P.length repos) - <> " repos to Airsequel …" - -- TODO: Save all repos in one request - repos - & mapM_ (saveRepoInAirsequel OverwriteRepo) + saveReposInAirsequel OverwriteRepo repos case gqlResponse.nextCursorMb of Nothing -> pure $ initialRepos <> repos @@ -394,7 +397,9 @@ run cliCmd = do repos <- loadAndSaveReposViaSearch ghTokenMb searchQueryNorm 20 Nothing - putText $ "🏁 Total number of crawled repos: " <> show (P.length repos) + putStrLn $ + "🏁 Total number of crawled repos: " + <> showInt (P.length repos) "" pure () diff --git a/app/Types.hs b/app/Types.hs index 82dedc4..a80b8aa 100644 --- a/app/Types.hs +++ b/app/Types.hs @@ -1,10 +1,9 @@ module Types where import Protolude ( - Bool (False), + Bool (..), Eq, Generic, - Int, Integer, Maybe (..), Show, @@ -18,22 +17,22 @@ import Protolude ( import Data.Aeson (FromJSON, Object, Value, withObject, (.:), (.:?)) import Data.Aeson.Types (parseJSON) -import Data.Time (UTCTime (UTCTime), fromGregorian, secondsToDiffTime) +import Data.Time (UTCTime) data Repo = Repo - { rowid :: Maybe Int -- Airsequel rowid - , owner :: Text - , name :: Text - , githubId :: Int - , stargazerCount :: Int + { githubId :: Integer + , rowid :: Maybe Integer -- Airsequel rowid + , owner :: Maybe Text + , name :: Maybe Text + , stargazerCount :: Maybe Integer , description :: Maybe Text , homepageUrl :: Maybe Text , primaryLanguage :: Maybe Text - , openIssuesCount :: Int - , isArchived :: Bool - , createdAt :: UTCTime - , updatedAt :: UTCTime + , openIssuesCount :: Maybe Integer + , isArchived :: Maybe Bool + , createdAt :: Maybe UTCTime + , updatedAt :: Maybe UTCTime , commitsCount :: Maybe Integer } deriving (Show, Eq, Generic) @@ -42,42 +41,64 @@ data Repo = Repo emptyRepo :: Repo emptyRepo = Repo - { rowid = Nothing - , owner = "" - , name = "" - , githubId = 0 - , stargazerCount = 0 + { githubId = 0 + , rowid = Nothing + , owner = Nothing + , name = Nothing + , stargazerCount = Nothing , description = Nothing , homepageUrl = Nothing , primaryLanguage = Nothing - , openIssuesCount = 0 - , isArchived = False - , createdAt = UTCTime (fromGregorian 1900 1 1) (secondsToDiffTime 0) - , updatedAt = UTCTime (fromGregorian 1900 1 1) (secondsToDiffTime 0) + , openIssuesCount = Nothing + , isArchived = Nothing + , createdAt = Nothing + , updatedAt = Nothing , commitsCount = Nothing } instance FromJSON Repo where parseJSON = withObject "RepoObject" $ \o -> do - owner <- o .: "owner" >>= (.: "login") - name <- o .: "name" githubId <- o .: "databaseId" - stargazerCount <- o .: "stargazerCount" - description <- o .: "description" - homepageUrl <- o .: "homepageUrl" - primaryLanguage <- o .: "primaryLanguage" >>= (.: "name") - openIssuesCount <- o .: "issues" >>= (.: "totalCount") - isArchived <- o .: "isArchived" - createdAt <- o .: "createdAt" - updatedAt <- o .: "updatedAt" - commitsCount <- - o .: "defaultBranchRef" - >>= (.: "target") - >>= (.: "history") - >>= (.: "totalCount") - - pure Repo{rowid = Nothing, ..} + rowid <- o .:? "rowid" + + ownerMb <- o .:? "owner" + owner <- case ownerMb of + Nothing -> pure Nothing + Just ownerObj -> ownerObj .: "login" + + nameMb <- o .:? "name" + name <- case nameMb of + Nothing -> pure Nothing + Just name -> pure $ Just name + + stargazerCount <- o .:? "stargazerCount" + description <- o .:? "description" + homepageUrl <- o .:? "homepageUrl" + + primaryLanguageMb <- o .:? "primaryLanguage" + primaryLanguage <- case primaryLanguageMb of + Nothing -> pure Nothing + Just langObj -> langObj .: "name" + + openIssuesCountMb <- o .:? "issues" + openIssuesCount <- case openIssuesCountMb of + Nothing -> pure Nothing + Just issuesObj -> issuesObj .: "totalCount" + + isArchived <- o .:? "isArchived" + createdAt <- o .:? "createdAt" + updatedAt <- o .:? "updatedAt" + + defaultBranchRef <- o .:? "defaultBranchRef" + commitsCount <- case defaultBranchRef of + Nothing -> pure Nothing + Just branchRef -> + branchRef .: "target" + >>= (.: "history") + >>= (.: "totalCount") + + pure Repo{..} -- | Generic GraphQL response