sorarebuddy/cmd/fbrefscraper/root.go
2024-05-23 08:18:54 +04:00

130 lines
3.1 KiB
Go

package main
import (
	"fmt"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gocolly/colly"
	"github.com/hashicorp/go-retryablehttp"
	"github.com/pkg/errors"
	"github.com/rs/zerolog/log"
	"github.com/samber/lo"
	"github.com/spf13/cobra"
	"github.com/uptrace/bun"

	"git.lehouerou.net/laurent/sorarebuddy/cmd/common"
	"git.lehouerou.net/laurent/sorarebuddy/db"
)
// Cmd is the root cobra command of this binary. Executing it calls run, after
// common.CmdPreRunE has been invoked as the persistent pre-run hook.
//
// NOTE(review): Use/Short/Long all read "console" although this file lives
// under cmd/fbrefscraper — presumably copied from another command; confirm.
var Cmd = &cobra.Command{
	Use:               "console",
	Short:             "console",
	Long:              "console",
	PersistentPreRunE: common.CmdPreRunE,
	RunE:              run,
}
// main is the program entry point; it hands the root command to common.Start.
func main() {
	common.Start(Cmd)
}
// init passes Cmd to common.InitParams before main runs.
// NOTE(review): presumably this registers the shared flags/parameters on the
// command — InitParams is defined outside this file, confirm there.
func init() {
	common.InitParams(Cmd)
}
// run is the cobra entry point of the command. It loads every player from the
// database and, for at most the first 100 of them, requests the fbref.com
// search page with the player's display name, logging the URL the search
// ended on.
//
// It returns an error when the database handle is missing from the command
// context, when the players cannot be loaded, or as soon as one search
// request fails. An empty player list is not an error.
func run(cmd *cobra.Command, _ []string) error {
	ctx := cmd.Context()
	dbconn, ok := ctx.Value(common.DbContextKey).(*bun.DB)
	if !ok {
		return errors.New("db not found in context")
	}

	dbclient := db.NewClient(dbconn)
	players, err := dbclient.Players.GetAll(ctx)
	if err != nil {
		return errors.Wrap(err, "getting players from db")
	}

	// lo.Chunk yields no chunk at all for an empty slice, so indexing [0]
	// unconditionally would panic when there are no players.
	batches := lo.Chunk(players, 100)
	if len(batches) == 0 {
		log.Debug().Msg("no players to look up")
		return nil
	}

	// Build the standard client once: StandardClient wraps the retryable
	// client in a new *http.Client on every call.
	httpClient := retryablehttp.NewClient().StandardClient()

	for _, p := range batches[0] {
		// QueryEscape encodes spaces as "+" (as the manual replacement did)
		// and additionally escapes accents, "&", "#" and friends that would
		// otherwise corrupt the query string.
		searchURL := fmt.Sprintf(
			"https://fbref.com/fr/search/search.fcgi?hint=&search=%s",
			url.QueryEscape(p.DisplayName),
		)
		_, finalURL, err := goQueryFromURL(httpClient, searchURL)
		if err != nil {
			return errors.Wrapf(err, "searching fbref for player %q", p.DisplayName)
		}
		log.Debug().Msgf("%s -> %s", p.DisplayName, finalURL)
	}
	return nil
}
// goQueryFromURL fetches url with httpClient and parses the response body as
// an HTML document.
//
// On success it returns the parsed document together with the URL of the
// request that produced the response (res.Request.URL), which can differ from
// url when the client followed redirects. A transport failure, a non-2xx
// status code or an HTML parsing failure is reported as an error; in that
// case the document is nil and the returned URL is empty.
func goQueryFromURL(httpClient *http.Client, url string) (*goquery.Document, string, error) {
	res, err := httpClient.Get(url)
	if err != nil {
		return nil, "", errors.Wrap(err, "requesting url")
	}
	defer res.Body.Close()

	log.Debug().Int("status", res.StatusCode).Str("url", url).Msg("HTTP request completed")

	if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices {
		// err is nil at this point and errors.Wrapf(nil, ...) returns nil,
		// which used to turn every HTTP-level failure into a silent
		// (nil, "", nil). Create a fresh error instead. res.Status already
		// starts with the numeric code, so it is not repeated.
		return nil, "", errors.Errorf("requesting failed at the http level: %s", res.Status)
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return nil, "", errors.Wrap(err, "parsing html")
	}
	return doc, res.Request.URL.String(), nil
}
// scrapePlayers crawls fbref.com starting from a hard-coded player page.
//
// Every anchor whose href matches the French player-page pattern
// (/fr/joueurs/<8-char id>/<name>) is recorded by its name segment; the first
// time a name is seen, the matching scouting-report page is queued for a
// visit. The collector runs asynchronously, limited to 2 parallel requests
// with a one second delay for the fbref.com domain. Once the crawl has
// finished, every collected player name is logged at debug level.
func scrapePlayers() {
	// Compiled once here instead of on every anchor of every visited page.
	playerLinkRe := regexp.MustCompile(`^/fr/joueurs/[a-z0-9]{8}/[a-zA-Z-]+$`)

	c := colly.NewCollector(
		colly.AllowedDomains("fbref.com"),
		colly.Async(true),
	)
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "fbref.com",
		Parallelism: 2,
		Delay:       1 * time.Second,
	}); err != nil {
		// Crawling without the rate limit would hammer the site; give up.
		log.Error().Err(err).Msg("configuring collector limits")
		return
	}

	var (
		mu      sync.Mutex
		players = make(map[string]bool)
	)
	// markSeen records playerName and reports whether it was seen for the
	// first time. OnHTML callbacks run concurrently in async mode, hence the
	// mutex around the map.
	markSeen := func(playerName string) bool {
		mu.Lock()
		defer mu.Unlock()
		if players[playerName] {
			return false
		}
		players[playerName] = true
		return true
	}

	c.OnHTML("a[href]", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		if !playerLinkRe.MatchString(link) {
			return
		}
		// The pattern guarantees the name is the last path segment.
		sep := strings.LastIndex(link, "/")
		playerPath, playerName := link[:sep], link[sep+1:]
		if !markSeen(playerName) {
			return
		}
		scoutURL := e.Request.AbsoluteURL(playerPath + "/scout/365_m2/Rapport-de-scouting-" + playerName)
		if err := c.Visit(scoutURL); err != nil {
			log.Debug().Err(err).Str("url", scoutURL).Msg("scouting report not queued")
		}
	})
	c.OnRequest(func(r *colly.Request) {
		log.Debug().Str("url", r.URL.String()).Msg("")
	})

	if err := c.Visit("https://fbref.com/fr/joueurs/df69b544/Antoine-Griezmann"); err != nil {
		log.Error().Err(err).Msg("visiting start page")
		return
	}
	c.Wait()

	// All callbacks have returned after Wait, so the map can be read
	// without taking the lock.
	for playerName := range players {
		log.Debug().Str("player_name", playerName).Msg("")
	}
}