package service

import (
	"bytes"
	"compress/flate"
	"compress/gzip"
	"context"
	"crypto/rand"
	"crypto/tls"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"mime/multipart"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/spf13/viper"
	"github.com/tidwall/gjson"
)
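
// CrawlerService wraps the HTTP interactions with the admin backend:
// logging in, scraping hidden form tokens, submitting forms, and fetching
// pages and key data.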
type CrawlerService interface {
	GetLoginCookie(ctx context.Context) (string, error)
	GetFormTokens(ctx context.Context, loginUrl string, cookieHeader string) (map[string]string, error)
	SendFormData(ctx context.Context, url string, cookie string, formData map[string]interface{}) ([]byte, error)
	GetField(ctx context.Context, appName string) (map[string]interface{}, error)
	GetKey(ctx context.Context, appName string) (string, error)
	DeleteRule(ctx context.Context, ruleID int, ruleUrl string) (string, error)
	FetchPageContent(ctx context.Context, url string, cookie string) ([]byte, error)
}
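
// CrawlerConfig holds the credentials and endpoint prefixes read from
// configuration.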
type CrawlerConfig struct {
	Username string
	Password string
	URL      string
	KeyURL   string
}
func NewCrawlerService(
	service *Service,
	parser ParserService,
	conf *viper.Viper,
) CrawlerService {
	// Build one shared HTTP client so all requests reuse the same connection pool.
	httpClient := &http.Client{
		Transport: &http.Transport{
			TLSClientConfig:     &tls.Config{InsecureSkipVerify: true},
			MaxIdleConns:        500,              // max idle connections overall
			MaxIdleConnsPerHost: 300,              // max idle connections per host
			IdleConnTimeout:     90 * time.Second, // how long an idle connection is kept open
		},
		Timeout: 30 * time.Second, // per-request timeout
	}
	return &crawlerService{
		Service: service,
		parser:  parser,
		config: &CrawlerConfig{
			Username: conf.GetString("crawler.username"),
			Password: conf.GetString("crawler.password"),
			URL:      conf.GetString("crawler.Url"),
			KeyURL:   conf.GetString("crawler.keyUrl"),
		},
		httpClient: httpClient,
	}
}
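
// A minimal sketch of the configuration this constructor reads, assuming a
// YAML config file (viper also accepts TOML/JSON; the host and paths below
// are illustrative placeholders, not real endpoints):
//
//	crawler:
//	  username: admin
//	  password: secret
//	  Url: "http://127.0.0.1:8080/"        # used as a prefix; keep the trailing slash
//	  keyUrl: "http://127.0.0.1:8080/api/" # appName is appended directly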
type crawlerService struct {
	*Service
	parser     ParserService
	config     *CrawlerConfig
	httpClient *http.Client
}
// randomHex returns a random hex string of 2n characters (n random bytes).
func randomHex(n int) string {
	b := make([]byte, n)
	if _, err := rand.Read(b); err != nil {
		panic(err)
	}
	return hex.EncodeToString(b)
}
// GetLoginCookie logs in with the configured credentials and returns the
// session cookies for subsequent requests.
func (service *crawlerService) GetLoginCookie(ctx context.Context) (string, error) {
	data := url.Values{}
	data.Set("username", service.config.Username)
	data.Set("password", service.config.Password)
	loginUrl := service.config.URL + "admin/signin"
	req, err := http.NewRequestWithContext(ctx, "POST", loginUrl, strings.NewReader(data.Encode()))
	if err != nil {
		return "", fmt.Errorf("failed to create login request: %v", err)
	}
	// Set the headers the backend expects.
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	req.Header.Set("Expect", "")
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36")
	req.Header.Set("Referer", loginUrl)
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	// Reuse the shared Transport, but disable automatic redirects so the
	// Set-Cookie headers on the login response itself are not lost.
	client := &http.Client{
		Transport: service.httpClient.Transport,
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			return http.ErrUseLastResponse // stop at the first response
		},
	}
	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("login request failed: %v", err)
	}
	defer resp.Body.Close()
	// Drain the body so the connection can be returned to the pool.
	if _, err = io.ReadAll(resp.Body); err != nil {
		return "", fmt.Errorf("failed to read login response: %v", err)
	}
	// Collect the name=value part of each raw Set-Cookie header.
	rawCookies := resp.Header["Set-Cookie"]
	var cookieStr strings.Builder
	for _, cookie := range rawCookies {
		parts := strings.SplitN(cookie, ";", 2)
		if len(parts) > 0 {
			cookieStr.WriteString(parts[0] + "; ")
		}
	}
	cookieHeader := strings.TrimRight(cookieStr.String(), "; ")
	if cookieHeader == "" {
		return "", fmt.Errorf("login response carried no Set-Cookie header")
	}
	return cookieHeader, nil
}
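
// The returned value joins the name=value pair of every Set-Cookie line into
// a single Cookie header string, e.g. "sid=abc123; remember=1" (cookie names
// here are illustrative; the actual names depend on the admin backend).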
// GetFormTokens loads the page behind loginUrl with the session cookie and
// extracts the hidden anti-CSRF form fields from the HTML.
func (service *crawlerService) GetFormTokens(ctx context.Context, loginUrl string, cookieHeader string) (map[string]string, error) {
	req, err := http.NewRequestWithContext(ctx, "GET", loginUrl, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %v", err)
	}
	// Send the cookie plus the PJAX headers so the backend returns the form fragment.
	req.Header.Set("Cookie", cookieHeader)
	req.Header.Set("X-PJAX", "true")
	req.Header.Set("X-PJAX-Container", "#pjax-container")
	req.Header.Set("X-Requested-With", "XMLHttpRequest")
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36")
	// Issue the request through the shared HTTP client.
	resp, err := service.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("request failed: %v", err)
	}
	defer resp.Body.Close()
	// Parse the HTML with goquery.
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to parse HTML: %v", err)
	}
	// Extract the hidden fields; an empty string means the field was absent.
	previous := doc.Find(`input[name="__go_admin_previous_"]`).AttrOr("value", "")
	t := doc.Find(`input[name="__go_admin_t_"]`).AttrOr("value", "")
	return map[string]string{
		"previous": previous,
		"t":        t,
	}, nil
}
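
// The page under loginUrl is expected to embed the tokens as hidden inputs,
// roughly like this (values illustrative):
//
//	<input type="hidden" name="__go_admin_previous_" value="/admin/info/rules">
//	<input type="hidden" name="__go_admin_t_" value="8f14e45fceea167a">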
// SendFormData posts formData to url as a multipart form, using the given
// session cookie, and returns the raw response body.
func (service *crawlerService) SendFormData(ctx context.Context, url string, cookie string, formData map[string]interface{}) ([]byte, error) {
	var buf bytes.Buffer
	writer := multipart.NewWriter(&buf)
	// Write every field into the multipart body, stringifying the value.
	for key, val := range formData {
		valueStr := fmt.Sprintf("%v", val)
		if err := writer.WriteField(key, valueStr); err != nil {
			return nil, fmt.Errorf("failed to write field: %v", err)
		}
	}
	// Close the writer to flush the trailing boundary.
	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("failed to close multipart writer: %v", err)
	}
	// Build the request.
	req, err := http.NewRequestWithContext(ctx, "POST", url, &buf)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %v", err)
	}
	// Set the request headers. Accept-Encoding is deliberately left unset so
	// the transport negotiates gzip itself and decompresses transparently;
	// setting it manually would hand us a compressed body we never inflate.
	req.Header.Set("Content-Type", writer.FormDataContentType())
	req.Header.Set("X-PJAX", "true")
	req.Header.Set("X-PJAX-Container", "#pjax-container")
	req.Header.Set("X-Requested-With", "XMLHttpRequest")
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
	req.Header.Set("Accept", "text/html, */*; q=0.01")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7")
	req.Header.Set("Cookie", cookie)
	req.Header.Set("Expect", "")
	// Issue the request through the shared HTTP client.
	resp, err := service.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to send request: %v", err)
	}
	defer resp.Body.Close()
	// Read the response body.
	res, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response: %v", err)
	}
	return res, nil
}
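
// exampleEditFlow is an illustrative sketch of how the pieces above compose:
// log in, scrape the CSRF tokens from the form page, merge them into the
// payload, then post it. It is not called anywhere in this package, and the
// "admin/info/rules/edit" path and "name" field are hypothetical placeholders.
func exampleEditFlow(ctx context.Context, svc CrawlerService, base string) ([]byte, error) {
	cookie, err := svc.GetLoginCookie(ctx)
	if err != nil {
		return nil, err
	}
	tokens, err := svc.GetFormTokens(ctx, base+"admin/info/rules/edit", cookie)
	if err != nil {
		return nil, err
	}
	form := map[string]interface{}{
		"name":                 "example-rule",
		"__go_admin_previous_": tokens["previous"],
		"__go_admin_t_":        tokens["t"],
	}
	return svc.SendFormData(ctx, base+"admin/info/rules/edit", cookie, form)
}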
// GetField fetches the key endpoint for appName and returns the object under
// the JSON path data.raw as a map.
func (service *crawlerService) GetField(ctx context.Context, appName string) (map[string]interface{}, error) {
	keyURL := service.config.KeyURL + appName
	// Go through the shared HTTP client instead of http.Get.
	req, err := http.NewRequestWithContext(ctx, "GET", keyURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %v", err)
	}
	resp, err := service.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}
	// First pull the raw JSON fragment under data.raw with gjson.
	result := gjson.GetBytes(body, "data.raw")
	if !result.Exists() {
		return nil, fmt.Errorf("response is missing data.raw")
	}
	// Then unmarshal that fragment into a map.
	var rawMap map[string]interface{}
	if err := json.Unmarshal([]byte(result.Raw), &rawMap); err != nil {
		return nil, fmt.Errorf("failed to parse data.raw: %w", err)
	}
	return rawMap, nil
}
// GetKey fetches the key endpoint for appName and returns the string under
// the JSON path data.key.
func (service *crawlerService) GetKey(ctx context.Context, appName string) (string, error) {
	// Go through the shared HTTP client instead of http.Get.
	req, err := http.NewRequestWithContext(ctx, "GET", service.config.KeyURL+appName, nil)
	if err != nil {
		return "", fmt.Errorf("failed to create request: %v", err)
	}
	resp, err := service.httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("failed to read response body: %w", err)
	}
	// Read the string directly from the JSON path data.key.
	result := gjson.GetBytes(body, "data.key")
	if !result.Exists() {
		return "", fmt.Errorf("response is missing data.key")
	}
	return result.String(), nil
}
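
// Both helpers assume the key endpoint answers with JSON shaped roughly like
// this (field names beyond data.key and data.raw are illustrative):
//
//	{"data": {"key": "abc123", "raw": {"rule_id": 1, "mode": "block"}}}
//
// GetField returns the data.raw object; GetKey returns the data.key string.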
// DeleteRule logs in, posts the rule ID to the deletion endpoint, and returns
// the parsed result message.
func (service *crawlerService) DeleteRule(ctx context.Context, ruleID int, ruleUrl string) (string, error) {
	// 1. Log in to obtain the session cookie.
	cookie, err := service.GetLoginCookie(ctx)
	if err != nil {
		return "", fmt.Errorf("login failed: %w", err)
	}
	// 2. Build the deletion URL and form payload.
	deleteURL := service.config.URL + ruleUrl
	formData := map[string]interface{}{
		"id": ruleID,
	}
	// 3. Submit the form through the existing SendFormData (multipart; the
	// backend also accepts x-www-form-urlencoded).
	respBody, err := service.SendFormData(ctx, deleteURL, cookie, formData)
	if err != nil {
		return "", err
	}
	// 4. Extract the human-readable message from the response.
	res, err := service.parser.GetMessage(ctx, respBody)
	if err != nil {
		return "", err
	}
	return res, nil
}
// FetchPageContent GETs config.URL+url with the given cookie and returns the
// decompressed response body.
func (service *crawlerService) FetchPageContent(ctx context.Context, url string, cookie string) ([]byte, error) {
	fetchUrl := service.config.URL + url
	// Use the shared HTTP client.
	client := service.httpClient
	// Build the request.
	req, err := http.NewRequestWithContext(ctx, "GET", fetchUrl, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %v", err)
	}
	// Set the request headers. Only advertise encodings we actually decode
	// below; "br" is omitted because brotli is never decompressed here.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8")
	req.Header.Set("Accept-Encoding", "gzip, deflate")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7")
	req.Header.Set("Cookie", cookie)
	// Issue the request.
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to send request: %v", err)
	}
	defer resp.Body.Close()
	// Check the status code.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("request failed with status code %d", resp.StatusCode)
	}
	// Decompress the body if the server compressed it. Setting Accept-Encoding
	// manually disables Go's automatic gzip handling, so it is done here.
	var reader io.Reader = resp.Body
	switch resp.Header.Get("Content-Encoding") {
	case "gzip":
		gzipReader, err := gzip.NewReader(resp.Body)
		if err != nil {
			return nil, fmt.Errorf("failed to decompress gzip response: %v", err)
		}
		defer gzipReader.Close()
		reader = gzipReader
	case "deflate":
		reader = flate.NewReader(resp.Body)
	}
	// Read the (decompressed) content.
	content, err := io.ReadAll(reader)
	if err != nil {
		return nil, fmt.Errorf("failed to read response content: %v", err)
	}
	return content, nil
}