package main import ( "fmt" "net/http" "regexp" "strings" "sync" "time" "github.com/PuerkitoBio/goquery" "github.com/gocolly/colly" "github.com/hashicorp/go-retryablehttp" "github.com/pkg/errors" "github.com/rs/zerolog/log" "github.com/samber/lo" "github.com/spf13/cobra" "github.com/uptrace/bun" "git.lehouerou.net/laurent/sorarebuddy/cmd/common" "git.lehouerou.net/laurent/sorarebuddy/db" ) var Cmd = &cobra.Command{ Use: "console", Short: "console", Long: `console`, RunE: run, PersistentPreRunE: common.CmdPreRunE, } func main() { common.Start(Cmd) } func init() { common.InitParams(Cmd) } func run(cmd *cobra.Command, _ []string) error { httpClient := retryablehttp.NewClient() ctx := cmd.Context() dbconn, ok := ctx.Value(common.DbContextKey).(*bun.DB) if !ok { return errors.New("db not found in context") } dbclient := db.NewClient(dbconn) players, err := dbclient.Players.GetAll(ctx) if err != nil { return errors.Wrap(err, "") } for _, p := range lo.Chunk(players, 100)[0] { name := strings.ReplaceAll(p.DisplayName, " ", "+") _, url, err := goQueryFromURL(httpClient.StandardClient(), fmt.Sprintf("https://fbref.com/fr/search/search.fcgi?hint=&search=%s", name), ) if err != nil { return errors.Wrap(err, "") } log.Debug().Msgf("%s -> %s", p.DisplayName, url) } return nil } func goQueryFromURL(httpClient *http.Client, url string) (*goquery.Document, string, error) { res, err := httpClient.Get(url) if err != nil { return nil, "", errors.Wrap(err, "requesting url") } defer res.Body.Close() log.Debug().Int("status", res.StatusCode).Str("url", url).Msg("HTTP request completed") if res.StatusCode < 200 || res.StatusCode >= 300 { return nil, "", errors.Wrapf(err, "requesting failed at the http level: %d %s", res.StatusCode, res.Status) } doc, err := goquery.NewDocumentFromReader(res.Body) if err != nil { return nil, "", errors.Wrap(err, "parsing html") } return doc, res.Request.URL.String(), nil } func scrapePlayers() { c := colly.NewCollector( colly.AllowedDomains("fbref.com"), colly.Async(true), ) players := make(map[string]bool) mutex := &sync.Mutex{} c.Limit(&colly.LimitRule{ DomainGlob: "fbref.com", Parallelism: 2, Delay: 1 * time.Second, }) c.OnHTML("a[href]", func(e *colly.HTMLElement) { link := e.Attr("href") if matched, _ := regexp.MatchString(`^/fr/joueurs/[a-z0-9]{8}/[a-zA-Z-]+$`, link); matched { splitLink := strings.Split(link, "/") playerName := splitLink[len(splitLink)-1] mutex.Lock() if _, ok := players[playerName]; !ok { players[playerName] = true mutex.Unlock() link = strings.Join(splitLink[:len(splitLink)-1], "/") + "/scout/365_m2/Rapport-de-scouting-" + playerName c.Visit(e.Request.AbsoluteURL(link)) } else { mutex.Unlock() } } }) c.OnRequest(func(r *colly.Request) { log.Debug().Str("url", r.URL.String()).Msg("") }) c.Visit("https://fbref.com/fr/joueurs/df69b544/Antoine-Griezmann") c.Wait() for playerName := range players { log.Debug().Str("player_name", playerName).Msg("") } }