130 lines
3.1 KiB
Go
130 lines
3.1 KiB
Go
|
package main
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"net/http"
|
||
|
"regexp"
|
||
|
"strings"
|
||
|
"sync"
|
||
|
"time"
|
||
|
|
||
|
"github.com/PuerkitoBio/goquery"
|
||
|
"github.com/gocolly/colly"
|
||
|
"github.com/hashicorp/go-retryablehttp"
|
||
|
"github.com/pkg/errors"
|
||
|
"github.com/rs/zerolog/log"
|
||
|
"github.com/samber/lo"
|
||
|
"github.com/spf13/cobra"
|
||
|
"github.com/uptrace/bun"
|
||
|
|
||
|
"git.lehouerou.net/laurent/sorarebuddy/cmd/common"
|
||
|
"git.lehouerou.net/laurent/sorarebuddy/db"
|
||
|
)
|
||
|
|
||
|
var Cmd = &cobra.Command{
|
||
|
Use: "console",
|
||
|
Short: "console",
|
||
|
Long: `console`,
|
||
|
RunE: run,
|
||
|
PersistentPreRunE: common.CmdPreRunE,
|
||
|
}
|
||
|
|
||
|
func main() {
|
||
|
common.Start(Cmd)
|
||
|
}
|
||
|
|
||
|
func init() {
|
||
|
common.InitParams(Cmd)
|
||
|
}
|
||
|
|
||
|
func run(cmd *cobra.Command, _ []string) error {
|
||
|
|
||
|
httpClient := retryablehttp.NewClient()
|
||
|
|
||
|
ctx := cmd.Context()
|
||
|
dbconn, ok := ctx.Value(common.DbContextKey).(*bun.DB)
|
||
|
if !ok {
|
||
|
return errors.New("db not found in context")
|
||
|
}
|
||
|
dbclient := db.NewClient(dbconn)
|
||
|
|
||
|
players, err := dbclient.Players.GetAll(ctx)
|
||
|
if err != nil {
|
||
|
return errors.Wrap(err, "")
|
||
|
}
|
||
|
for _, p := range lo.Chunk(players, 100)[0] {
|
||
|
name := strings.ReplaceAll(p.DisplayName, " ", "+")
|
||
|
_, url, err := goQueryFromURL(httpClient.StandardClient(),
|
||
|
fmt.Sprintf("https://fbref.com/fr/search/search.fcgi?hint=&search=%s", name),
|
||
|
)
|
||
|
if err != nil {
|
||
|
return errors.Wrap(err, "")
|
||
|
}
|
||
|
log.Debug().Msgf("%s -> %s", p.DisplayName, url)
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
|
||
|
}
|
||
|
|
||
|
func goQueryFromURL(httpClient *http.Client, url string) (*goquery.Document, string, error) {
|
||
|
res, err := httpClient.Get(url)
|
||
|
if err != nil {
|
||
|
return nil, "", errors.Wrap(err, "requesting url")
|
||
|
}
|
||
|
defer res.Body.Close()
|
||
|
log.Debug().Int("status", res.StatusCode).Str("url", url).Msg("HTTP request completed")
|
||
|
if res.StatusCode < 200 || res.StatusCode >= 300 {
|
||
|
return nil, "", errors.Wrapf(err, "requesting failed at the http level: %d %s", res.StatusCode, res.Status)
|
||
|
}
|
||
|
doc, err := goquery.NewDocumentFromReader(res.Body)
|
||
|
if err != nil {
|
||
|
return nil, "", errors.Wrap(err, "parsing html")
|
||
|
}
|
||
|
return doc, res.Request.URL.String(), nil
|
||
|
}
|
||
|
|
||
|
func scrapePlayers() {
|
||
|
c := colly.NewCollector(
|
||
|
colly.AllowedDomains("fbref.com"),
|
||
|
colly.Async(true),
|
||
|
)
|
||
|
|
||
|
players := make(map[string]bool)
|
||
|
mutex := &sync.Mutex{}
|
||
|
|
||
|
c.Limit(&colly.LimitRule{
|
||
|
DomainGlob: "fbref.com",
|
||
|
Parallelism: 2,
|
||
|
Delay: 1 * time.Second,
|
||
|
})
|
||
|
|
||
|
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
|
||
|
link := e.Attr("href")
|
||
|
if matched, _ := regexp.MatchString(`^/fr/joueurs/[a-z0-9]{8}/[a-zA-Z-]+$`, link); matched {
|
||
|
splitLink := strings.Split(link, "/")
|
||
|
playerName := splitLink[len(splitLink)-1]
|
||
|
mutex.Lock()
|
||
|
if _, ok := players[playerName]; !ok {
|
||
|
players[playerName] = true
|
||
|
mutex.Unlock()
|
||
|
link = strings.Join(splitLink[:len(splitLink)-1], "/") + "/scout/365_m2/Rapport-de-scouting-" + playerName
|
||
|
c.Visit(e.Request.AbsoluteURL(link))
|
||
|
} else {
|
||
|
mutex.Unlock()
|
||
|
}
|
||
|
}
|
||
|
})
|
||
|
|
||
|
c.OnRequest(func(r *colly.Request) {
|
||
|
log.Debug().Str("url", r.URL.String()).Msg("")
|
||
|
})
|
||
|
|
||
|
c.Visit("https://fbref.com/fr/joueurs/df69b544/Antoine-Griezmann")
|
||
|
c.Wait()
|
||
|
|
||
|
for playerName := range players {
|
||
|
log.Debug().Str("player_name", playerName).Msg("")
|
||
|
}
|
||
|
}
|