package service

import (
	"bytes"
	"compress/flate"
	"compress/gzip"
	"context"
	"crypto/rand"
	"crypto/tls"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"mime/multipart"
	"net/http"
	"net/url"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/spf13/viper"
	"github.com/tidwall/gjson"
)

type CrawlerService interface {
	GetLoginCookie(ctx context.Context) (string, error)
	GetFormTokens(ctx context.Context, loginUrl string, cookieHeader string) (map[string]string, error)
	SendFormData(ctx context.Context, url string, cookie string, formData map[string]interface{}) ([]byte, error)
	GetField(ctx context.Context, appName string) (map[string]interface{}, error)
	GetKey(ctx context.Context, appName string) (string, error)
	DeleteRule(ctx context.Context, ruleID int, ruleUrl string) (string, error)
	FetchPageContent(ctx context.Context, url string, cookie string) ([]byte, error)
}

type CrawlerConfig struct {
	Username string
	Password string
	URL      string
	KeyURL   string
}

func NewCrawlerService(
	service *Service,
	parser ParserService,
	conf *viper.Viper,
) CrawlerService {
	// Build one shared HTTP client so the whole service reuses a single
	// connection pool.
	httpClient := &http.Client{
		Transport: &http.Transport{
			TLSClientConfig:     &tls.Config{InsecureSkipVerify: true},
			MaxIdleConns:        500,              // max idle connections overall
			MaxIdleConnsPerHost: 300,              // max idle connections per host
			IdleConnTimeout:     90 * time.Second, // how long an idle connection is kept
		},
		Timeout: 30 * time.Second, // per-request timeout
	}

	return &crawlerService{
		Service: service,
		parser:  parser,
		config: &CrawlerConfig{
			Username: conf.GetString("crawler.username"),
			Password: conf.GetString("crawler.password"),
			URL:      conf.GetString("crawler.url"),
			KeyURL:   conf.GetString("crawler.keyUrl"),
		},
		httpClient: httpClient,
	}
}

type crawlerService struct {
	*Service
	parser     ParserService
	config     *CrawlerConfig
	httpClient *http.Client
}

// randomHex returns n random bytes hex-encoded (2n characters).
func randomHex(n int) string {
	b := make([]byte, n)
	if _, err := rand.Read(b); err != nil {
		panic(err)
	}
	return hex.EncodeToString(b)
}

// GetLoginCookie signs in with the configured credentials and returns the
// session cookies as a single Cookie header value.
func (service *crawlerService) GetLoginCookie(ctx context.Context) (string, error) {
	data := url.Values{}
	data.Set("username", service.config.Username)
	data.Set("password", service.config.Password)

	loginUrl := service.config.URL + "admin/signin"
	req, err := http.NewRequestWithContext(ctx, "POST", loginUrl, strings.NewReader(data.Encode()))
	if err != nil {
		return "", fmt.Errorf("create request failed: %w", err)
	}

	// Headers the sign-in endpoint expects.
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	req.Header.Set("Expect", "")
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36")
	req.Header.Set("Referer", loginUrl)
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")

	// Reuse the shared transport, but disable automatic redirects so the
	// Set-Cookie headers of the sign-in response itself can be read.
	client := &http.Client{
		Transport: service.httpClient.Transport,
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			return http.ErrUseLastResponse
		},
	}

	resp, err := client.Do(req)
	if err != nil {
		return "", fmt.Errorf("login request failed: %w", err)
	}
	defer resp.Body.Close()

	// Drain the body so the underlying connection can be reused.
	if _, err := io.Copy(io.Discard, resp.Body); err != nil {
		return "", fmt.Errorf("read response failed: %w", err)
	}

	// Collect the name=value pairs from the raw Set-Cookie headers, dropping
	// attributes such as Path and HttpOnly.
	rawCookies := resp.Header["Set-Cookie"]
	pairs := make([]string, 0, len(rawCookies))
	for _, cookie := range rawCookies {
		pairs = append(pairs, strings.SplitN(cookie, ";", 2)[0])
	}
	cookieHeader := strings.Join(pairs, "; ")
	if cookieHeader == "" {
		return "", fmt.Errorf("no Set-Cookie header in login response")
	}
	return cookieHeader, nil
}
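// A minimal sketch of how the login, token, and form-submit steps compose
// (illustrative only: svc, ctx, and formURL are hypothetical names, and the
// hidden-field names are assumed to be posted back verbatim):
//
//	cookie, err := svc.GetLoginCookie(ctx)
//	if err != nil {
//		return err
//	}
//	tokens, err := svc.GetFormTokens(ctx, formURL, cookie)
//	if err != nil {
//		return err
//	}
//	form := map[string]interface{}{
//		"__go_admin_previous_": tokens["previous"],
//		"__go_admin_t_":        tokens["t"],
//		// ...plus the actual form fields being submitted...
//	}
//	body, err := svc.SendFormData(ctx, formURL, cookie, form)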
// GetFormTokens loads a form page and extracts the hidden go-admin CSRF tokens.
func (service *crawlerService) GetFormTokens(ctx context.Context, loginUrl string, cookieHeader string) (map[string]string, error) {
	req, err := http.NewRequestWithContext(ctx, "GET", loginUrl, nil)
	if err != nil {
		return nil, fmt.Errorf("create request failed: %w", err)
	}

	// Set the Cookie and PJAX headers so the server renders the form fragment.
	req.Header.Set("Cookie", cookieHeader)
	req.Header.Set("X-PJAX", "true")
	req.Header.Set("X-PJAX-Container", "#pjax-container")
	req.Header.Set("X-Requested-With", "XMLHttpRequest")
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36")

	// Issue the request through the shared HTTP client.
	resp, err := service.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	// Parse the HTML with goquery.
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parse HTML failed: %w", err)
	}

	// Extract the hidden fields; fail loudly instead of submitting placeholder
	// values downstream.
	previous, okPrev := doc.Find(`input[name="__go_admin_previous_"]`).Attr("value")
	t, okT := doc.Find(`input[name="__go_admin_t_"]`).Attr("value")
	if !okPrev || !okT {
		return nil, fmt.Errorf("hidden form tokens not found in page")
	}

	return map[string]string{
		"previous": previous,
		"t":        t,
	}, nil
}

// SendFormData posts formData as a multipart form and returns the raw response body.
func (service *crawlerService) SendFormData(ctx context.Context, url string, cookie string, formData map[string]interface{}) ([]byte, error) {
	var buf bytes.Buffer
	writer := multipart.NewWriter(&buf)

	// Write every field into the multipart form, stringifying the values.
	for key, val := range formData {
		valueStr := fmt.Sprintf("%v", val)
		if err := writer.WriteField(key, valueStr); err != nil {
			return nil, fmt.Errorf("write field failed: %w", err)
		}
	}

	// Close the writer to flush the trailing boundary.
	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("close multipart writer failed: %w", err)
	}

	// Build the request.
	req, err := http.NewRequestWithContext(ctx, "POST", url, &buf)
	if err != nil {
		return nil, fmt.Errorf("create request failed: %w", err)
	}

	// Request headers. Accept-Encoding is deliberately left unset: setting it
	// by hand disables the transport's transparent gzip decompression, and this
	// function reads the body as-is.
	req.Header.Set("Content-Type", writer.FormDataContentType())
	req.Header.Set("X-PJAX", "true")
	req.Header.Set("X-PJAX-Container", "#pjax-container")
	req.Header.Set("X-Requested-With", "XMLHttpRequest")
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
	req.Header.Set("Accept", "text/html, */*; q=0.01")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7")
	req.Header.Set("Cookie", cookie)
	req.Header.Set("Expect", "")

	// Issue the request through the shared HTTP client.
	resp, err := service.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("send request failed: %w", err)
	}
	defer resp.Body.Close()

	// Read the response body.
	res, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("read response failed: %w", err)
	}
	return res, nil
}

// GetField fetches the rule record for appName and returns the data.raw object
// as a map.
func (service *crawlerService) GetField(ctx context.Context, appName string) (map[string]interface{}, error) {
	keyURL := service.config.KeyURL + appName

	// Go through the shared HTTP client instead of http.Get.
	req, err := http.NewRequestWithContext(ctx, "GET", keyURL, nil)
	if err != nil {
		return nil, fmt.Errorf("create request failed: %w", err)
	}
	resp, err := service.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("read response body failed: %w", err)
	}

	// First locate the raw JSON fragment under data.raw with gjson...
	result := gjson.GetBytes(body, "data.raw")
	if !result.Exists() {
		return nil, fmt.Errorf("response is missing data.raw")
	}

	// ...then unmarshal just that fragment into a map.
	var rawMap map[string]interface{}
	if err := json.Unmarshal([]byte(result.Raw), &rawMap); err != nil {
		return nil, fmt.Errorf("parse data.raw failed: %w", err)
	}
	return rawMap, nil
}
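// GetField above and GetKey below hit the same key-service endpoint. The
// response envelope implied by the gjson paths looks roughly like this sketch
// (the field names inside raw and all values are invented placeholders):
//
//	{
//	  "data": {
//	    "key": "some-rule-key",
//	    "raw": { "id": 1, "name": "..." }
//	  }
//	}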
// GetKey fetches the rule record for appName and returns the string at data.key.
func (service *crawlerService) GetKey(ctx context.Context, appName string) (string, error) {
	// Go through the shared HTTP client instead of http.Get.
	req, err := http.NewRequestWithContext(ctx, "GET", service.config.KeyURL+appName, nil)
	if err != nil {
		return "", fmt.Errorf("create request failed: %w", err)
	}
	resp, err := service.httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("read response body failed: %w", err)
	}

	// Read the string straight from the JSON path data.key.
	result := gjson.GetBytes(body, "data.key")
	if !result.Exists() {
		return "", fmt.Errorf("response is missing data.key")
	}
	return result.String(), nil
}

// DeleteRule logs in, posts a delete request for ruleID to ruleUrl, and
// returns the parsed response message.
func (service *crawlerService) DeleteRule(ctx context.Context, ruleID int, ruleUrl string) (string, error) {
	// 1. Log in and obtain the session cookie.
	cookie, err := service.GetLoginCookie(ctx)
	if err != nil {
		return "", fmt.Errorf("login failed: %w", err)
	}

	// 2. Build the delete URL and form payload.
	deleteURL := service.config.URL + ruleUrl
	formData := map[string]interface{}{
		"id": ruleID,
	}

	// 3. Submit the form. The endpoint accepts multipart as well as
	// x-www-form-urlencoded, so the existing SendFormData is reused here.
	respBody, err := service.SendFormData(ctx, deleteURL, cookie, formData)
	if err != nil {
		return "", err
	}

	res, err := service.parser.GetMessage(ctx, respBody)
	if err != nil {
		return "", err
	}
	return res, nil
}

// FetchPageContent fetches a page relative to the configured base URL and
// returns the decompressed body.
func (service *crawlerService) FetchPageContent(ctx context.Context, url string, cookie string) ([]byte, error) {
	fetchUrl := service.config.URL + url

	// Build the request.
	req, err := http.NewRequestWithContext(ctx, "GET", fetchUrl, nil)
	if err != nil {
		return nil, fmt.Errorf("create request failed: %w", err)
	}

	// Request headers. Only advertise encodings this function can actually
	// decode below; "br" is not offered because there is no brotli reader here.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8")
	req.Header.Set("Accept-Encoding", "gzip, deflate")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7")
	req.Header.Set("Cookie", cookie)

	// Issue the request through the shared HTTP client.
	resp, err := service.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("send request failed: %w", err)
	}
	defer resp.Body.Close()

	// Check the status code.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("request failed with status code %d", resp.StatusCode)
	}

	// Decompress the response if the server compressed it. Because
	// Accept-Encoding is set by hand above, the transport's transparent gzip
	// handling is disabled and decompression must happen here.
	var reader io.Reader = resp.Body
	switch resp.Header.Get("Content-Encoding") {
	case "gzip":
		gzipReader, err := gzip.NewReader(resp.Body)
		if err != nil {
			return nil, fmt.Errorf("decompress gzip response failed: %w", err)
		}
		defer gzipReader.Close()
		reader = gzipReader
	case "deflate":
		reader = flate.NewReader(resp.Body)
	}

	// Read the (possibly decompressed) body.
	content, err := io.ReadAll(reader)
	if err != nil {
		return nil, fmt.Errorf("read response content failed: %w", err)
	}
	return content, nil
}
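// Construction sketch (assumes a *Service and ParserService from this package
// and a viper config carrying the crawler.* keys; all values and the ruleUrl
// path are placeholders, not real endpoints):
//
//	conf := viper.New()
//	conf.Set("crawler.username", "admin")
//	conf.Set("crawler.password", "secret")
//	conf.Set("crawler.url", "https://target.example/")
//	conf.Set("crawler.keyUrl", "https://keys.example/api/app/")
//	svc := NewCrawlerService(baseService, parser, conf)
//	msg, err := svc.DeleteRule(ctx, 42, "some/delete/path")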