|
@@ -13,7 +13,6 @@ import (
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/spf13/viper"
|
|
"github.com/spf13/viper"
|
|
"github.com/tidwall/gjson"
|
|
"github.com/tidwall/gjson"
|
|
- "go.uber.org/zap"
|
|
|
|
"io"
|
|
"io"
|
|
"mime/multipart"
|
|
"mime/multipart"
|
|
"net/http"
|
|
"net/http"
|
|
@@ -44,6 +43,17 @@ func NewCrawlerService(
|
|
parser ParserService,
|
|
parser ParserService,
|
|
conf *viper.Viper,
|
|
conf *viper.Viper,
|
|
) CrawlerService {
|
|
) CrawlerService {
|
|
|
|
+ // 创建一个全局HTTP客户端,用于复用连接池
|
|
|
|
+ httpClient := &http.Client{
|
|
|
|
+ Transport: &http.Transport{
|
|
|
|
+ TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
|
|
|
+ MaxIdleConns: 100, // 最大空闲连接数
|
|
|
|
+ MaxIdleConnsPerHost: 100, // 每个主机的最大空闲连接数
|
|
|
|
+ IdleConnTimeout: 90 * time.Second, // 空闲连接超时时间
|
|
|
|
+ },
|
|
|
|
+ Timeout: 30 * time.Second, // 请求超时时间
|
|
|
|
+ }
|
|
|
|
+
|
|
return &crawlerService{
|
|
return &crawlerService{
|
|
Service: service,
|
|
Service: service,
|
|
parser: parser,
|
|
parser: parser,
|
|
@@ -53,13 +63,15 @@ func NewCrawlerService(
|
|
URL: conf.GetString("crawler.Url"),
|
|
URL: conf.GetString("crawler.Url"),
|
|
KeyURL: conf.GetString("crawler.keyUrl"),
|
|
KeyURL: conf.GetString("crawler.keyUrl"),
|
|
},
|
|
},
|
|
|
|
+ httpClient: httpClient,
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
type crawlerService struct {
|
|
type crawlerService struct {
|
|
*Service
|
|
*Service
|
|
- parser ParserService
|
|
|
|
- config *CrawlerConfig
|
|
|
|
|
|
+ parser ParserService
|
|
|
|
+ config *CrawlerConfig
|
|
|
|
+ httpClient *http.Client
|
|
}
|
|
}
|
|
|
|
|
|
// 生成随机字符串
|
|
// 生成随机字符串
|
|
@@ -82,7 +94,6 @@ func (service *crawlerService) GetLoginCookie(ctx context.Context) (string, erro
|
|
if err != nil {
|
|
if err != nil {
|
|
return "", fmt.Errorf("操作失败: %v", err)
|
|
return "", fmt.Errorf("操作失败: %v", err)
|
|
}
|
|
}
|
|
- service.logger.WithValue(ctx, zap.Time("time0========================", time.Now()))
|
|
|
|
// 添加关键请求头
|
|
// 添加关键请求头
|
|
req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
|
req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
|
req.Header.Set("Expect", "")
|
|
req.Header.Set("Expect", "")
|
|
@@ -90,13 +101,14 @@ func (service *crawlerService) GetLoginCookie(ctx context.Context) (string, erro
|
|
req.Header.Set("Referer", loginUrl)
|
|
req.Header.Set("Referer", loginUrl)
|
|
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
|
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
|
|
|
|
|
|
|
|
+ // 使用已有的HTTP客户端,但添加禁止重定向的选项
|
|
client := &http.Client{
|
|
client := &http.Client{
|
|
|
|
+ // 复制原有客户端的Transport
|
|
|
|
+ Transport: service.httpClient.Transport,
|
|
|
|
+ // 添加禁止自动跳转的设置
|
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
|
return http.ErrUseLastResponse // 禁止自动跳转
|
|
return http.ErrUseLastResponse // 禁止自动跳转
|
|
},
|
|
},
|
|
- Transport: &http.Transport{
|
|
|
|
- TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
|
|
|
- },
|
|
|
|
}
|
|
}
|
|
|
|
|
|
resp, err := client.Do(req)
|
|
resp, err := client.Do(req)
|
|
@@ -104,7 +116,6 @@ func (service *crawlerService) GetLoginCookie(ctx context.Context) (string, erro
|
|
return "", fmt.Errorf("%v", err)
|
|
return "", fmt.Errorf("%v", err)
|
|
}
|
|
}
|
|
defer resp.Body.Close()
|
|
defer resp.Body.Close()
|
|
- service.logger.WithValue(ctx, zap.Time("time1===================================", time.Now()))
|
|
|
|
// 输出响应体,调试用
|
|
// 输出响应体,调试用
|
|
_, err = io.ReadAll(resp.Body)
|
|
_, err = io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
if err != nil {
|
|
@@ -120,7 +131,6 @@ func (service *crawlerService) GetLoginCookie(ctx context.Context) (string, erro
|
|
cookieStr.WriteString(parts[0] + "; ")
|
|
cookieStr.WriteString(parts[0] + "; ")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
- service.logger.WithValue(ctx, zap.Time("time2=====================================", time.Now()))
|
|
|
|
cookieHeader := strings.TrimRight(cookieStr.String(), "; ")
|
|
cookieHeader := strings.TrimRight(cookieStr.String(), "; ")
|
|
|
|
|
|
if cookieHeader == "" {
|
|
if cookieHeader == "" {
|
|
@@ -143,15 +153,8 @@ func (service *crawlerService) GetFormTokens(ctx context.Context, loginUrl strin
|
|
req.Header.Set("X-Requested-With", "XMLHttpRequest")
|
|
req.Header.Set("X-Requested-With", "XMLHttpRequest")
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36")
|
|
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36")
|
|
|
|
|
|
- // HTTP 客户端,跳过 SSL 验证
|
|
|
|
- client := &http.Client{
|
|
|
|
- Transport: &http.Transport{
|
|
|
|
- TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
|
|
|
- },
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- // 发送请求
|
|
|
|
- resp, err := client.Do(req)
|
|
|
|
|
|
+ // 使用共享的HTTP客户端发起请求
|
|
|
|
+ resp, err := service.httpClient.Do(req)
|
|
if err != nil {
|
|
if err != nil {
|
|
return nil, fmt.Errorf("请求失败: %v", err)
|
|
return nil, fmt.Errorf("请求失败: %v", err)
|
|
}
|
|
}
|
|
@@ -206,15 +209,8 @@ func (service *crawlerService) SendFormData(ctx context.Context, url string, coo
|
|
req.Header.Set("Cookie", cookie)
|
|
req.Header.Set("Cookie", cookie)
|
|
req.Header.Set("Expect", "")
|
|
req.Header.Set("Expect", "")
|
|
|
|
|
|
- // 跳过 SSL 验证的 HTTP 客户端
|
|
|
|
- client := &http.Client{
|
|
|
|
- Transport: &http.Transport{
|
|
|
|
- TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
|
|
|
- },
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- // 发起请求
|
|
|
|
- resp, err := client.Do(req)
|
|
|
|
|
|
+ // 使用共享的HTTP客户端发起请求
|
|
|
|
+ resp, err := service.httpClient.Do(req)
|
|
if err != nil {
|
|
if err != nil {
|
|
return nil, fmt.Errorf("请求发送失败: %v", err)
|
|
return nil, fmt.Errorf("请求发送失败: %v", err)
|
|
}
|
|
}
|
|
@@ -232,7 +228,12 @@ func (service *crawlerService) SendFormData(ctx context.Context, url string, coo
|
|
// 获取 rule_id
|
|
// 获取 rule_id
|
|
func (service *crawlerService) GetField(ctx context.Context, appName string) (map[string]interface{}, error) {
|
|
func (service *crawlerService) GetField(ctx context.Context, appName string) (map[string]interface{}, error) {
|
|
keyURL := service.config.KeyURL + appName
|
|
keyURL := service.config.KeyURL + appName
|
|
- resp, err := http.Get(keyURL)
|
|
|
|
|
|
+ // 使用共享的HTTP客户端而不是http.Get
|
|
|
|
+ req, err := http.NewRequestWithContext(ctx, "GET", keyURL, nil)
|
|
|
|
+ if err != nil {
|
|
|
|
+ return nil, fmt.Errorf("创建请求失败: %v", err)
|
|
|
|
+ }
|
|
|
|
+ resp, err := service.httpClient.Do(req)
|
|
if err != nil {
|
|
if err != nil {
|
|
return nil, fmt.Errorf("请求失败:%w", err)
|
|
return nil, fmt.Errorf("请求失败:%w", err)
|
|
}
|
|
}
|
|
@@ -258,7 +259,12 @@ func (service *crawlerService) GetField(ctx context.Context, appName string) (ma
|
|
}
|
|
}
|
|
|
|
|
|
func (service *crawlerService) GetKey(ctx context.Context, appName string) (string, error) {
|
|
func (service *crawlerService) GetKey(ctx context.Context, appName string) (string, error) {
|
|
- resp, err := http.Get(service.config.KeyURL + appName)
|
|
|
|
|
|
+ // 使用共享的HTTP客户端而不是http.Get
|
|
|
|
+ req, err := http.NewRequestWithContext(ctx, "GET", service.config.KeyURL+appName, nil)
|
|
|
|
+ if err != nil {
|
|
|
|
+ return "", fmt.Errorf("创建请求失败: %v", err)
|
|
|
|
+ }
|
|
|
|
+ resp, err := service.httpClient.Do(req)
|
|
if err != nil {
|
|
if err != nil {
|
|
return "", fmt.Errorf("请求失败:%w", err)
|
|
return "", fmt.Errorf("请求失败:%w", err)
|
|
}
|
|
}
|
|
@@ -306,16 +312,8 @@ func (service *crawlerService) DeleteRule(ctx context.Context, ruleID int, ruleU
|
|
|
|
|
|
func (service *crawlerService) FetchPageContent(ctx context.Context, url string, cookie string) ([]byte, error) {
|
|
func (service *crawlerService) FetchPageContent(ctx context.Context, url string, cookie string) ([]byte, error) {
|
|
fetchUrl := service.config.URL + url
|
|
fetchUrl := service.config.URL + url
|
|
- // 配置 HTTP 客户端
|
|
|
|
- client := &http.Client{
|
|
|
|
- Transport: &http.Transport{
|
|
|
|
- TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
|
|
|
|
- MaxIdleConns: 100,
|
|
|
|
- MaxIdleConnsPerHost: 100,
|
|
|
|
- IdleConnTimeout: 90 * time.Second,
|
|
|
|
- },
|
|
|
|
- Timeout: 30 * time.Second,
|
|
|
|
- }
|
|
|
|
|
|
+ // 使用服务中的共享HTTP客户端
|
|
|
|
+ client := service.httpClient
|
|
|
|
|
|
// 构造请求
|
|
// 构造请求
|
|
req, err := http.NewRequestWithContext(ctx, "GET", fetchUrl, nil)
|
|
req, err := http.NewRequestWithContext(ctx, "GET", fetchUrl, nil)
|