You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
45 lines
1.1 KiB
Go
45 lines
1.1 KiB
Go
package scrapeutils
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"strings"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/imroc/req/v3"
|
|
)
|
|
|
|
// userAgent is the User-Agent header value installed on ReqClient. It is
// written as a constant concatenation purely to keep the line length
// manageable; the resulting value is a single string.
const userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " +
	"AppleWebKit/537.36 (KHTML, like Gecko) " +
	"Chrome/111.0.0.0 Safari/537.36 Edg/111.0.0.0"
|
|
|
|
var ReqClient = req.NewClient().
|
|
DisableAutoReadResponse().
|
|
EnableInsecureSkipVerify().
|
|
SetUserAgent(userAgent)
|
|
|
|
func GetHTML(ctx context.Context, url string) (string, error) {
|
|
res, err := ReqClient.R().SetContext(ctx).SetRetryCount(3).Get(url)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to get url: %w", err)
|
|
}
|
|
html, err := res.ToString()
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to read response body: %w", err)
|
|
}
|
|
return html, nil
|
|
}
|
|
|
|
func GetParsed(ctx context.Context, url string) (*goquery.Document, error) {
|
|
html, err := GetHTML(ctx, url)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get html: %w", err)
|
|
}
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse html: %w", err)
|
|
}
|
|
return doc, nil
|
|
}
|
|
|
|
// Fetcher is declared with no fields; no methods are attached to it in the
// visible part of this file.
type Fetcher struct{}
|