From d67dd201e65705df3d5c8228b5fdb27ba240c9f1 Mon Sep 17 00:00:00 2001 From: Giorgos Komninos Date: Sun, 14 May 2023 10:10:37 +0300 Subject: [PATCH] Automatic cookie handling and init job --- README.md | 1 + examples/quotes-to-scrape-app-login/go.mod | 34 +++++ examples/quotes-to-scrape-app-login/go.sum | 131 ++++++++++++++++++ examples/quotes-to-scrape-app-login/main.go | 47 +++++++ .../quotes/collect.go | 72 ++++++++++ .../quotes/login.go | 101 ++++++++++++++ .../quotes/quote.go | 36 +++++ scrapemate.go | 56 ++++++++ scrapemateapp/config.go | 12 +- scrapemateapp/scrapemateapp.go | 17 ++- 10 files changed, 498 insertions(+), 9 deletions(-) create mode 100644 examples/quotes-to-scrape-app-login/go.mod create mode 100644 examples/quotes-to-scrape-app-login/go.sum create mode 100644 examples/quotes-to-scrape-app-login/main.go create mode 100644 examples/quotes-to-scrape-app-login/quotes/collect.go create mode 100644 examples/quotes-to-scrape-app-login/quotes/login.go create mode 100644 examples/quotes-to-scrape-app-login/quotes/quote.go diff --git a/README.md b/README.md index 7b2c70c..bef1bd8 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ Scrapemate is a web crawling and scraping framework written in Golang. 
It is des - Caching (File/LevelDB/Custom) - Custom job providers (memory provider included) - Headless and Headful support when using JS rendering +- Automatic cookie and session handling ## Installation diff --git a/examples/quotes-to-scrape-app-login/go.mod b/examples/quotes-to-scrape-app-login/go.mod new file mode 100644 index 0000000..ef4ecc2 --- /dev/null +++ b/examples/quotes-to-scrape-app-login/go.mod @@ -0,0 +1,34 @@ +module github.com/gosom/scrapemate/quotestoscrapelogin + +go 1.20 + +require ( + github.com/PuerkitoBio/goquery v1.8.1 + github.com/google/uuid v1.3.0 + github.com/gosom/scrapemate v0.4.2 +) + +require ( + github.com/andybalholm/cascadia v1.3.1 // indirect + github.com/danwakefield/fnmatch v0.0.0-20160403171240-cbb64ac3d964 // indirect + github.com/go-playground/locales v0.14.0 // indirect + github.com/go-playground/universal-translator v0.18.0 // indirect + github.com/go-playground/validator/v10 v10.11.1 // indirect + github.com/go-stack/stack v1.8.1 // indirect + github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db // indirect + github.com/gosom/kit v0.0.0-20230309082109-543b32ac686a // indirect + github.com/leodido/go-urn v1.2.1 // indirect + github.com/mattn/go-colorable v0.1.12 // indirect + github.com/mattn/go-isatty v0.0.14 // indirect + github.com/playwright-community/playwright-go v0.2000.1 // indirect + github.com/rs/zerolog v1.28.0 // indirect + github.com/syndtr/goleveldb v1.0.0 // indirect + golang.org/x/crypto v0.3.0 // indirect + golang.org/x/net v0.7.0 // indirect + golang.org/x/sync v0.1.0 // indirect + golang.org/x/sys v0.5.0 // indirect + golang.org/x/text v0.7.0 // indirect + gopkg.in/square/go-jose.v2 v2.6.0 // indirect +) + +replace github.com/gosom/scrapemate v0.4.2 => ../../ diff --git a/examples/quotes-to-scrape-app-login/go.sum b/examples/quotes-to-scrape-app-login/go.sum new file mode 100644 index 0000000..39bf3e4 --- /dev/null +++ b/examples/quotes-to-scrape-app-login/go.sum @@ -0,0 +1,131 @@ 
+github.com/PuerkitoBio/goquery v1.8.1 h1:uQxhNlArOIdbrH1tr0UXwdVFgDcZDrZVdcpygAcwmWM= +github.com/PuerkitoBio/goquery v1.8.1/go.mod h1:Q8ICL1kNUJ2sXGoAhPGUdYDJvgQgHzJsnnd3H7Ho5jQ= +github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x004T2c= +github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA= +github.com/coreos/go-systemd/v22 v22.3.3-0.20220203105225-a9a7ef127534/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/danwakefield/fnmatch v0.0.0-20160403171240-cbb64ac3d964 h1:y5HC9v93H5EPKqaS1UYVg1uYah5Xf51mBfIoWehClUQ= +github.com/danwakefield/fnmatch v0.0.0-20160403171240-cbb64ac3d964/go.mod h1:Xd9hchkHSWYkEqJwUGisez3G1QY8Ryz0sdWrLPMGjLk= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= +github.com/go-playground/assert/v2 v2.0.1 h1:MsBgLAaY856+nPRTKrp3/OZK38U/wa0CcBYNjji3q3A= +github.com/go-playground/assert/v2 v2.0.1/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4= +github.com/go-playground/locales v0.14.0 h1:u50s323jtVGugKlcYeyzC0etD1HifMjqmJqb8WugfUU= +github.com/go-playground/locales v0.14.0/go.mod h1:sawfccIbzZTqEDETgFXqTho0QybSa7l++s0DH+LDiLs= +github.com/go-playground/universal-translator v0.18.0 h1:82dyy6p4OuJq4/CByFNOn/jYrnRPArHwAcmLoJZxyho= +github.com/go-playground/universal-translator v0.18.0/go.mod h1:UvRDBj+xPUEGrFYl+lu/H90nyDXpg0fqeB/AQUGNTVA= +github.com/go-playground/validator/v10 v10.11.1 h1:prmOlTVv+YjZjmRmNSF3VmspqJIxJWXmqUsHwfTRRkQ= +github.com/go-playground/validator/v10 v10.11.1/go.mod h1:i+3WkQ1FvaUjjxh1kSvIA4dMGDBiPU55YFDl0WbKdWU= +github.com/go-stack/stack v1.8.1 
h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw= +github.com/go-stack/stack v1.8.1/go.mod h1:dcoOX6HbPZSZptuspn9bctJ+N/CnF5gGygcUP3XYfe4= +github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/golang/mock v1.6.0 h1:ErTB+efbowRARo13NNdxyJji2egdxLGQhRaY+DUumQc= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db h1:woRePGFeVFfLKN/pOkfl+p/TAqKOfFu+7KPlMVpok/w= +github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= +github.com/gosom/kit v0.0.0-20230309082109-543b32ac686a h1:5tcB33GTXm0pFUiEFpmE91tMsHQj+I+W7zubT8J/ugI= +github.com/gosom/kit v0.0.0-20230309082109-543b32ac686a/go.mod h1:ngnWSsuBEpCA5Y43kZRa3x8RBYZZ4LDtvZHO4N5dHZ0= +github.com/gosom/scrapemate v0.4.2 h1:kt+XcB6fouOuLraW8fal63XKQ2ZCsH8sFwujGP0AV5U= +github.com/gosom/scrapemate v0.4.2/go.mod h1:GMka6KvSZlOiY+9f21cwNgvgawMHjVANZ2uGsGtz2Ak= +github.com/h2non/filetype v1.1.1/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY= +github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= +github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.0/go.mod h1:640gp4NfQd8pI5XOwp5fnNeVWj67G7CFk/SaSQn7NBk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0/go.mod 
h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/leodido/go-urn v1.2.1 h1:BqpAaACuzVSgi/VLzGZIobT2z4v53pjosyNd9Yv6n/w= +github.com/leodido/go-urn v1.2.1/go.mod h1:zt4jvISO2HfUBqxjfIshjdMTYS56ZS/qv49ictyFfxY= +github.com/mattn/go-colorable v0.1.12 h1:jF+Du6AlPIjs2BiUiQlKOX0rt3SujHxPnksPKZbaA40= +github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4= +github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y= +github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= +github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/ginkgo v1.7.0 h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs= +github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= +github.com/onsi/gomega v1.4.3 h1:RE1xgDvH7imwFD45h+u2SgIfERHlS2yNG4DObb5BSKU= +github.com/onsi/gomega v1.4.3/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/playwright-community/playwright-go v0.2000.1 h1:2JViSHpJQ/UL/PO1Gg6gXV5IcXAAsoBJ3KG9L3wKXto= +github.com/playwright-community/playwright-go v0.2000.1/go.mod h1:1y9cM9b9dVHnuRWzED1KLM7FtbwTJC8ibDjI6MNqewU= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rogpeppe/go-internal v1.6.1/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= +github.com/rogpeppe/go-internal v1.8.0/go.mod h1:WmiCO8CzOY8rg0OYDC4/i/2WRWAB6poM+XZ2dLUbcbE= +github.com/rs/xid v1.4.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= +github.com/rs/zerolog v1.28.0 h1:MirSo27VyNi7RJYP3078AA1+Cyzd2GB66qy3aUHvsWY= +github.com/rs/zerolog v1.28.0/go.mod 
h1:NILgTygv/Uej1ra5XxGf82ZFSLk58MFGAUS2o6usyD0= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= +github.com/syndtr/goleveldb v1.0.0 h1:fBdIW9lB4Iz0n9khmH8w27SJ3QEJ7+IgjPEwGSZiFdE= +github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.0.0-20211215153901-e495a2d5b3d3/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.3.0 h1:a06MkbcxBrEFc0w0QIZWXrH/9cCX6KJyWbBOIwAn+7A= +golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20210916014120-12bc252f5db8/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g= 
+golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210806184541-e5e7981a1069/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod 
h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= +gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/square/go-jose.v2 v2.6.0 h1:NGk74WTnPKBNUhNzQX7PYcTLUjoq7mzKk2OKbvwk2iI= +gopkg.in/square/go-jose.v2 v2.6.0/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76mk0e1AI= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.2.1 
h1:mUhvW9EsL+naU5Q3cakzfE91YhliOondGd6ZrsDBHQE= +gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/examples/quotes-to-scrape-app-login/main.go b/examples/quotes-to-scrape-app-login/main.go new file mode 100644 index 0000000..ce42c7b --- /dev/null +++ b/examples/quotes-to-scrape-app-login/main.go @@ -0,0 +1,47 @@ +package main + +import ( + "context" + "encoding/csv" + "os" + + "github.com/gosom/scrapemate" + + "github.com/gosom/scrapemate/adapters/writers/csvwriter" + "github.com/gosom/scrapemate/quotestoscrapelogin/quotes" + "github.com/gosom/scrapemate/scrapemateapp" +) + +func main() { + if err := run(); err != nil { + os.Stderr.WriteString(err.Error() + "\n") + os.Exit(1) + return + } + os.Exit(0) +} + +func run() error { + csvWriter := csvwriter.NewCsvWriter(csv.NewWriter(os.Stdout)) + + writers := []scrapemate.ResultWriter{ + csvWriter, + } + + cfg, err := scrapemateapp.NewConfig(writers, + scrapemateapp.WithInitJob(quotes.NewLoginCRSFToken()), + ) + if err != nil { + return err + } + + app, err := scrapemateapp.NewScrapeMateApp(cfg) + if err != nil { + return err + } + + seedJobs := []scrapemate.IJob{ + quotes.NewQuoteCollectJob("https://quotes.toscrape.com/"), + } + return app.Start(context.Background(), seedJobs...) 
+} diff --git a/examples/quotes-to-scrape-app-login/quotes/collect.go b/examples/quotes-to-scrape-app-login/quotes/collect.go new file mode 100644 index 0000000..4bf3ec9 --- /dev/null +++ b/examples/quotes-to-scrape-app-login/quotes/collect.go @@ -0,0 +1,72 @@ +package quotes + +import ( + "context" + "errors" + "fmt" + "net/http" + "time" + + "github.com/PuerkitoBio/goquery" + "github.com/google/uuid" + "github.com/gosom/scrapemate" +) + +// QuoteCollectJob is a job that collects quotes from a page +type QuoteCollectJob struct { + scrapemate.Job +} + +// NewQuoteCollectJob creates a new QuoteCollectJob +func NewQuoteCollectJob(u string) *QuoteCollectJob { + return &QuoteCollectJob{ + Job: scrapemate.Job{ + // just give it a random id + ID: uuid.New().String(), + Method: http.MethodGet, + URL: u, + Headers: map[string]string{ + "User-Agent": scrapemate.DefaultUserAgent, + }, + Timeout: 10 * time.Second, + MaxRetries: 1, + }, + } +} + +// Process is the function that will be called by scrapemate to process the job +func (o *QuoteCollectJob) Process(ctx context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) { + log := scrapemate.GetLoggerFromContext(ctx) + log.Info("processing quotes collect job") + doc, ok := resp.Document.(*goquery.Document) + if !ok { + return nil, nil, fmt.Errorf("invalid document type %T expected *goquery.Document", resp.Document) + } + + if err := CheckLogin(doc); err != nil { + return nil, nil, err + } + + quotes, err := parseQuotes(doc) + if err != nil { + return nil, nil, err + } + var nextJobs []scrapemate.IJob + nextPage, err := parseNextPage(doc) + if err == nil { + nextJobs = append(nextJobs, NewQuoteCollectJob(nextPage)) + } + + return quotes, nextJobs, nil +} + +var noNextPage = errors.New("no next page") + +func parseNextPage(doc *goquery.Document) (string, error) { + nextPage := doc.Find(".next > a").AttrOr("href", "") + if nextPage == "" { + return "", noNextPage + } + nextPage = "http://quotes.toscrape.com" + 
nextPage + return nextPage, nil +} diff --git a/examples/quotes-to-scrape-app-login/quotes/login.go b/examples/quotes-to-scrape-app-login/quotes/login.go new file mode 100644 index 0000000..f40e9ff --- /dev/null +++ b/examples/quotes-to-scrape-app-login/quotes/login.go @@ -0,0 +1,101 @@ +package quotes + +import ( + "context" + "errors" + "fmt" + "net/http" + "net/url" + + "github.com/PuerkitoBio/goquery" + "github.com/gosom/scrapemate" +) + +type LoginJob struct { + scrapemate.Job +} + +func NewLoginJob(username, password, token string) *LoginJob { + data := url.Values{ + "csrf_token": {token}, + "username": {username}, + "password": {password}, + } + body := []byte(data.Encode()) + return &LoginJob{ + Job: scrapemate.Job{ + URL: "https://quotes.toscrape.com/login", + Method: http.MethodPost, + Headers: map[string]string{ + "Content-Type": "application/x-www-form-urlencoded", + }, + Body: body, + MaxRetries: 1, + }, + } +} + +func (o *LoginJob) Process(_ context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) { + + doc, ok := resp.Document.(*goquery.Document) + if !ok { + return nil, nil, fmt.Errorf("invalid document type %T expected *goquery.Document", resp.Document) + } + + if err := CheckLogin(doc); err != nil { + return nil, nil, err + } + + return nil, nil, nil +} + +type LoginCRSFToken struct { + scrapemate.Job +} + +func NewLoginCRSFToken() *LoginCRSFToken { + return &LoginCRSFToken{ + Job: scrapemate.Job{ + URL: "https://quotes.toscrape.com/login", + Method: http.MethodGet, + MaxRetries: 1, + }, + } +} + +// Process will extract the CSRF token from the login page and will create a new login job with the token +func (o *LoginCRSFToken) Process(ctx context.Context, resp *scrapemate.Response) (any, []scrapemate.IJob, error) { + log := scrapemate.GetLoggerFromContext(ctx) + log.Info("processing LoginCRSFToken job") + + doc, ok := resp.Document.(*goquery.Document) + if !ok { + return nil, nil, fmt.Errorf("invalid document type %T expected 
*goquery.Document", resp.Document) + } + + token, ok := doc.Find("input[name='csrf_token']").First().Attr("value") + if !ok { + return nil, nil, errors.New("could not find csrf token") + } + + next := []scrapemate.IJob{ + NewLoginJob("admin", "admin", token), + } + + return nil, next, nil +} + +func CheckLogin(doc *goquery.Document) error { + sel := `div.header-box p>a` + el := doc.Find(sel) + if el.Length() == 0 { + return errors.New("no login element found") + } + + txt := el.Text() + if txt != "Logout" { + return fmt.Errorf("invalid text %s", txt) + } + + return nil +} diff --git a/examples/quotes-to-scrape-app-login/quotes/quote.go b/examples/quotes-to-scrape-app-login/quotes/quote.go new file mode 100644 index 0000000..faef7bc --- /dev/null +++ b/examples/quotes-to-scrape-app-login/quotes/quote.go @@ -0,0 +1,36 @@ +package quotes + +import ( + "strings" + + "github.com/PuerkitoBio/goquery" +) + +type Quote struct { + Author string + Text string + Tags []string +} + +func (q Quote) CsvHeaders() []string { + return []string{"author", "text", "tags"} +} + +func (q Quote) CsvRow() []string { + return []string{q.Author, q.Text, strings.Join(q.Tags, ",")} +} + +func parseQuotes(doc *goquery.Document) ([]Quote, error) { + var quotes []Quote + doc.Find(".quote").Each(func(i int, s *goquery.Selection) { + quote := Quote{ + Author: s.Find(".author").Text(), + Text: s.Find(".text").Text(), + } + s.Find(".tag").Each(func(i int, s *goquery.Selection) { + quote.Tags = append(quote.Tags, s.Text()) + }) + quotes = append(quotes, quote) + }) + return quotes, nil +} diff --git a/scrapemate.go b/scrapemate.go index 318d74b..9131738 100644 --- a/scrapemate.go +++ b/scrapemate.go @@ -154,6 +154,21 @@ func WithCache(cache Cacher) func(*ScrapeMate) error { } } +// WithInitJob sets the first job to be processed +// It will be processed before the jobs from the job provider +// It is useful if you want to start the scraper with a specific job +// instead of the first one from the job 
provider +// A real use case is when you want to obtain some cookies before starting +// the scraping process (e.g. login) +// Important: The results from this job will be discarded! +func WithInitJob(job IJob) func(*ScrapeMate) error { + return func(s *ScrapeMate) error { + s.initJob = job + + return nil + } +} + // Scrapemate contains unexporter fields type ScrapeMate struct { log logging.Logger @@ -166,6 +181,7 @@ type ScrapeMate struct { cache Cacher results chan Result failedJobs chan IJob + initJob IJob } // Start starts the scraper @@ -187,6 +203,10 @@ func (s *ScrapeMate) Start() error { signal.Notify(exitChan, os.Interrupt, syscall.SIGTERM) s.waitForSignal(exitChan) + if err := s.processInitJob(s.ctx); err != nil { + return err + } + wg := sync.WaitGroup{} wg.Add(s.concurrency) @@ -391,6 +411,42 @@ func (s *ScrapeMate) waitForSignal(sigChan <-chan os.Signal) { }() } +func (s *ScrapeMate) processInitJob(ctx context.Context) error { + if s.initJob == nil { + return nil + } + + s.log.Info("processing init", "job", s.initJob) + defer s.log.Info("init job finished", "job", s.initJob) + + var stack []IJob + + if s.initJob != nil { + stack = append(stack, s.initJob) + } + + var job IJob + + for len(stack) > 0 { + select { + case <-ctx.Done(): + return nil + default: + } + + job, stack = stack[0], stack[1:] + + _, next, err := s.DoJob(ctx, job) + if err != nil { + return err + } + + stack = append(stack, next...) + } + + return nil +} + func (s *ScrapeMate) startWorker(ctx context.Context) { jobc, errc := s.jobProvider.Jobs(ctx) diff --git a/scrapemateapp/config.go b/scrapemateapp/config.go index 8442765..5c51bd2 100644 --- a/scrapemateapp/config.go +++ b/scrapemateapp/config.go @@ -72,10 +72,10 @@ func WithProvider(provider scrapemate.JobProvider) func(*Config) error { } } -// WithUseSession sets UseSession to true. -func WithUseSession() func(*Config) error { +// WithInitJob sets the initial job of the app. 
+func WithInitJob(job scrapemate.IJob) func(*Config) error { return func(o *Config) error { - o.UseSession = true + o.InitJob = job return nil } @@ -121,10 +121,8 @@ type Config struct { // Writers are the writers to use for writing the results. // At least one writer must be provided. Writers []scrapemate.ResultWriter `validate:"required,gt=0"` - - // UseSession is whether to use a session for the scraper - // only works with JS enabled for now. - UseSession bool `validate:"omitempty"` + // InitJob is the job to initialize the app with. + InitJob scrapemate.IJob } func (o *Config) validate() error { diff --git a/scrapemateapp/scrapemateapp.go b/scrapemateapp/scrapemateapp.go index add0414..45b0cb3 100644 --- a/scrapemateapp/scrapemateapp.go +++ b/scrapemateapp/scrapemateapp.go @@ -4,6 +4,7 @@ import ( "context" "errors" "net/http" + "net/http/cookiejar" "time" "github.com/gosom/scrapemate" @@ -118,6 +119,10 @@ func (app *ScrapemateApp) getMate(ctx context.Context) (*scrapemate.ScrapeMate, params = append(params, scrapemate.WithCache(app.cacher)) } + if app.cfg.InitJob != nil { + params = append(params, scrapemate.WithInitJob(app.cfg.InitJob)) + } + return scrapemate.New(params...) } @@ -152,9 +157,17 @@ func (app *ScrapemateApp) getFetcher() (scrapemate.HTTPFetcher, error) { return nil, err } default: - httpFetcher = fetcher.New(&http.Client{ + cookieJar, err := cookiejar.New(nil) + if err != nil { + return nil, err + } + + netClient := &http.Client{ Timeout: timeout, - }) + Jar: cookieJar, + } + + httpFetcher = fetcher.New(netClient) } return httpFetcher, nil