Golang source code analysis: the colly crawler (part II)

Picking up where "Golang source code analysis: the colly crawler (part I)" left off, let's look at colly's core file, colly.go.

H. colly.go starts by defining the function type for the functional options used when building a crawler:

    type CollectorOption func(*Collector)
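colly's built-in options (colly.UserAgent, colly.MaxDepth, colly.Async, ...) are all values of this type, and you can write your own in the same style. A minimal sketch; the helper name WithRevisit is hypothetical, for illustration only:

    // WithRevisit is a hypothetical custom option; it follows the same
    // pattern as colly's built-in CollectorOption helpers.
    func WithRevisit() colly.CollectorOption {
        return func(c *colly.Collector) {
            c.AllowURLRevisit = true // allow downloading the same URL more than once
        }
    }

It would then be passed to colly.NewCollector just like the built-in options.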

Its parameter, the Collector struct, is defined as follows. It holds all of the crawler's configuration, and each field's doc comment is already very detailed:

    type Collector struct {
        // UserAgent is the User-Agent string used by HTTP requests
        UserAgent string
        // MaxDepth limits the recursion depth of visited URLs.
        // Set it to 0 for infinite recursion (default).
        MaxDepth int
        // AllowedDomains is a domain whitelist.
        // Leave it blank to allow any domains to be visited
        AllowedDomains []string
        // DisallowedDomains is a domain blacklist.
        DisallowedDomains []string
        // DisallowedURLFilters is a list of regular expressions which restricts
        // visiting URLs. If any of the rules matches to a URL the
        // request will be stopped. DisallowedURLFilters will
        // be evaluated before URLFilters
        // Leave it blank to allow any URLs to be visited
        DisallowedURLFilters []*regexp.Regexp
        // URLFilters is a list of regular expressions which restricts
        // visiting URLs. If any of the rules matches to a URL the
        // request won't be stopped. DisallowedURLFilters will
        // be evaluated before URLFilters
        // Leave it blank to allow any URLs to be visited
        URLFilters []*regexp.Regexp
        // AllowURLRevisit allows multiple downloads of the same URL
        AllowURLRevisit bool
        // MaxBodySize is the limit of the retrieved response body in bytes.
        // 0 means unlimited.
        // The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
        MaxBodySize int
        // CacheDir specifies a location where GET requests are cached as files.
        // When it's not defined, caching is disabled.
        CacheDir string
        // IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
        // the target host's robots.txt file. See http://www.robotstxt.org/ for more
        // information.
        IgnoreRobotsTxt bool
        // Async turns on asynchronous network communication. Use Collector.Wait() to
        // be sure all requests have been finished.
        Async bool
        // ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
        // By default, Colly parses only successful HTTP responses. Set ParseHTTPErrorResponse
        // to true to enable it.
        ParseHTTPErrorResponse bool
        // ID is the unique identifier of a collector
        ID uint32
        // DetectCharset can enable character encoding detection for non-utf8 response bodies
        // without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
        DetectCharset bool
        // RedirectHandler allows control on how a redirect will be managed
        // use c.SetRedirectHandler to set this value
        redirectHandler func(req *http.Request, via []*http.Request) error
        // CheckHead performs a HEAD request before every GET to pre-validate the response
        CheckHead bool
        // TraceHTTP enables capturing and reporting request performance for crawler tuning.
        // When set to true, the Response.Trace will be filled in with an HTTPTrace object.
        TraceHTTP bool
        // Context is the context that will be used for HTTP requests. You can set this
        // to support clean cancellation of scraping.
        Context context.Context

        store                    storage.Storage
        debugger                 debug.Debugger
        robotsMap                map[string]*robotstxt.RobotsData
        htmlCallbacks            []*htmlCallbackContainer
        xmlCallbacks             []*xmlCallbackContainer
        requestCallbacks         []RequestCallback
        responseCallbacks        []ResponseCallback
        responseHeadersCallbacks []ResponseHeadersCallback
        errorCallbacks           []ErrorCallback
        scrapedCallbacks         []ScrapedCallback
        requestCount             uint32
        responseCount            uint32
        backend                  *httpBackend
        wg                       *sync.WaitGroup
        lock                     *sync.RWMutex
    }

Next come the Collector's methods:

    func (c *Collector) Init()
    func (c *Collector) Appengine(ctx context.Context)
    func (c *Collector) Visit(URL string) error
    func (c *Collector) HasVisited(URL string) (bool, error)
    func (c *Collector) HasPosted(URL string, requestData map[string]string) (bool, error)
    func (c *Collector) Head(URL string) error
    func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error
    func (c *Collector) UnmarshalRequest(r []byte) (*Request, error)
    func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error
    func setRequestBody(req *http.Request, body io.Reader)
    func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, req *http.Request) error
    func (c *Collector) requestCheck(u string, parsedURL *url.URL, method string, requestData io.Reader, depth int, checkRevisit bool) error
    func (c *Collector) checkRobots(u *url.URL) error
    func (c *Collector) OnRequest(f RequestCallback) {
        c.lock.Lock()
        if c.requestCallbacks == nil {
            c.requestCallbacks = make([]RequestCallback, 0, 4)
        }
        c.requestCallbacks = append(c.requestCallbacks, f)
        c.lock.Unlock()
    }
    func (c *Collector) OnResponseHeaders(f ResponseHeadersCallback) {
        c.lock.Lock()
        c.responseHeadersCallbacks = append(c.responseHeadersCallbacks, f)
        c.lock.Unlock()
    }
    func (c *Collector) handleOnRequest(r *Request) {
        if c.debugger != nil {
            c.debugger.Event(createEvent("request", r.ID, c.ID, map[string]string{
                "url": r.URL.String(),
            }))
        }
        for _, f := range c.requestCallbacks {
            f(r)
        }
    }
    func (c *Collector) handleOnHTML(resp *Response) error

Of these, the ones we use most often are the following:

    func (c *Collector) Visit(URL string) error {
        return c.scrape(URL, "GET", 1, nil, nil, nil, true)
    }

Visit is the entry point that starts a crawl; it delegates to the scrape function:

    func (c *Collector) scrape(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, checkRevisit bool) error {
        // ... requestCheck, URL parsing and http.Request construction elided ...
        if c.Async {
            go c.fetch(u, method, depth, requestData, ctx, hdr, req)
            return nil
        }
        return c.fetch(u, method, depth, requestData, ctx, hdr, req)
    }

Depending on the Async flag, scrape calls fetch either in a new goroutine or synchronously:

    func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header, req *http.Request) error {
        // key calls only; error handling and bookkeeping elided
        c.handleOnRequest(request)
        // ... the HTTP request is executed via c.backend.Do ...
        c.handleOnResponseHeaders(&Response{Ctx: ctx, Request: request, StatusCode: statusCode, Headers: &headers})
        if err := c.handleOnError(response, err, request, ctx); err != nil {
            return err
        }
        c.handleOnResponse(response)
        err = c.handleOnHTML(response)
        c.handleOnScraped(response)
    }

Inside fetch, the callbacks we registered are invoked; these are the hook points.
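To make the hook points concrete, here is a minimal user-side sketch (the URL is a placeholder) that registers a callback for each stage, in the order fetch fires them:

    c := colly.NewCollector()
    c.OnRequest(func(r *colly.Request) {
        fmt.Println("visiting", r.URL) // fired before the request is sent
    })
    c.OnError(func(r *colly.Response, err error) {
        fmt.Println("request failed:", err) // fired when the request or response fails
    })
    c.OnResponse(func(r *colly.Response) {
        fmt.Println("got", r.StatusCode, "with", len(r.Body), "bytes")
    })
    c.OnScraped(func(r *colly.Response) {
        fmt.Println("finished", r.Request.URL) // fired after the OnHTML/OnXML callbacks
    })
    c.Visit("https://example.com/")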

Next, a series of type aliases for these hooks is defined:

    // RequestCallback is a type alias for OnRequest callback functions
    type RequestCallback func(*Request)

    // ResponseHeadersCallback is a type alias for OnResponseHeaders callback functions
    type ResponseHeadersCallback func(*Response)

    // ResponseCallback is a type alias for OnResponse callback functions
    type ResponseCallback func(*Response)

    // HTMLCallback is a type alias for OnHTML callback functions
    type HTMLCallback func(*HTMLElement)

    // XMLCallback is a type alias for OnXML callback functions
    type XMLCallback func(*XMLElement)

    // ErrorCallback is a type alias for OnError callback functions
    type ErrorCallback func(*Response, error)

    // ScrapedCallback is a type alias for OnScraped callback functions
    type ScrapedCallback func(*Response)

    // ProxyFunc is a type alias for proxy setter functions.
    type ProxyFunc func(*http.Request) (*url.URL, error)

envMap stores the environment variables, i.e., a series of settings applied before the crawler starts:

    var envMap = map[string]func(*Collector, string){
        "ALLOWED_DOMAINS": func(c *Collector, val string) { c.AllowedDomains = strings.Split(val, ",") },
        "CACHE_DIR":       func(c *Collector, val string) { c.CacheDir = val },
        // ... one entry per supported setting ...
    }

During crawler initialization, after the option funcs have run and applied their settings, these environment variables are parsed:

    func NewCollector(options ...CollectorOption) *Collector {
        c := &Collector{}
        c.Init()
        for _, f := range options {
            f(c)
        }
        c.parseSettingsFromEnv()
        return c
    }
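Putting the pieces together, here is a minimal runnable sketch of this initialization flow (the domain is a placeholder; in recent colly versions the environment variables carry a COLLY_ prefix, e.g. COLLY_ALLOWED_DOMAINS):

    package main

    import (
        "fmt"

        "github.com/gocolly/colly"
    )

    func main() {
        // Option funcs run first inside NewCollector; parseSettingsFromEnv
        // runs afterwards, so environment settings win over option funcs.
        c := colly.NewCollector(
            colly.MaxDepth(2),
            colly.AllowedDomains("example.com"),
        )
        c.OnRequest(func(r *colly.Request) {
            fmt.Println("visiting", r.URL)
        })
        if err := c.Visit("https://example.com/"); err != nil {
            fmt.Println("visit failed:", err)
        }
    }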

I. context.go defines the Context type:

    type Context struct {
        contextMap map[string]interface{}
        lock       *sync.RWMutex
    }
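Context is a lock-protected key/value map that travels from a Request to its Response, which makes it handy for passing data between callbacks. A minimal sketch, assuming a collector c and the usual fmt/time imports:

    c.OnRequest(func(r *colly.Request) {
        r.Ctx.Put("startedAt", time.Now().Format(time.RFC3339))
    })
    c.OnResponse(func(r *colly.Response) {
        // the same Context instance is shared by the request and its response
        fmt.Println("request started at", r.Ctx.Get("startedAt"))
    })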

J. htmlelement.go defines the helpers commonly used to parse HTML:

    type HTMLElement struct {
        // Name is the name of the tag
        Name string
        Text string
        attributes []html.Attribute
        // Request is the request object of the element's HTML document
        Request *Request
        // Response is the Response object of the element's HTML document
        Response *Response
        // DOM is the goquery parsed DOM object of the page. DOM is relative
        // to the current HTMLElement
        DOM *goquery.Selection
        // Index stores the position of the current element within all the elements matched by an OnHTML callback
        Index int
    }

    func (h *HTMLElement) Attr(k string) string
    func (h *HTMLElement) ChildText(goquerySelector string) string
    func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string
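These helpers are normally used inside an OnHTML callback; a minimal link-following sketch (the selector is illustrative):

    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        link := e.Attr("href")
        fmt.Println("found link:", link, "text:", e.Text)
        // resolve relative URLs against the page and schedule a visit;
        // depth and domain checks still apply inside scrape/requestCheck
        e.Request.Visit(e.Request.AbsoluteURL(link))
    })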

K. http_backend.go defines the restrictions a user can place on outgoing requests:

    type httpBackend struct {
        LimitRules []*LimitRule
        Client     *http.Client
        lock       *sync.RWMutex
    }

    type LimitRule struct {
        // DomainRegexp is a regular expression to match against domains
        DomainRegexp string
        // DomainGlob is a glob pattern to match against domains
        DomainGlob string
        // Delay is the duration to wait before creating a new request to the matching domains
        Delay time.Duration
        // RandomDelay is the extra randomized duration to wait added to Delay before creating a new request
        RandomDelay time.Duration
        // Parallelism is the number of the maximum allowed concurrent requests of the matching domains
        Parallelism int
        waitChan       chan bool
        compiledRegexp *regexp.Regexp
        compiledGlob   glob.Glob
    }

    func (r *LimitRule) Match(domain string) bool
    func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc) (*Response, error)
        res, err := h.Client.Do(request)
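LimitRules are installed with Collector.Limit; a minimal sketch that throttles every domain (the numbers are arbitrary, and the usual time/log imports are assumed):

    // At most 2 parallel requests per matching domain, with a randomized
    // delay between requests, to be polite to the target site.
    err := c.Limit(&colly.LimitRule{
        DomainGlob:  "*",
        Parallelism: 2,
        Delay:       1 * time.Second,
        RandomDelay: 500 * time.Millisecond,
    })
    if err != nil {
        log.Fatal(err)
    }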

L. http_trace.go defines the trace-related data:

    type HTTPTrace struct {
        start, connect    time.Time
        ConnectDuration   time.Duration
        FirstByteDuration time.Duration
    }
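Tracing is opt-in: set Collector.TraceHTTP to true and each Response will carry a filled-in Trace. A minimal sketch:

    c.TraceHTTP = true // ask fetch to attach an HTTPTrace to every Response
    c.OnResponse(func(r *colly.Response) {
        if r.Trace != nil {
            fmt.Println("connect:", r.Trace.ConnectDuration,
                "first byte:", r.Trace.FirstByteDuration)
        }
    })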

M. request.go defines the request-related data:

    type Request struct {
        // URL is the parsed URL of the HTTP request
        URL *url.URL
        // Headers contains the Request's HTTP headers
        Headers *http.Header
        // Ctx is a context between a Request and a Response
        Ctx *Context
        // Depth is the number of the parents of the request
        Depth int
        // Method is the HTTP method of the request
        Method string
        // Body is the request body which is used on POST/PUT requests
        Body io.Reader
        // ResponseCharacterEncoding is the character encoding of the response body.
        // Leave it blank to allow automatic character encoding of the response body.
        // It is empty by default and it can be set in OnRequest callback.
        ResponseCharacterEncoding string
        // ID is the Unique identifier of the request
        ID uint32
        collector *Collector
        abort     bool
        baseURL   *url.URL
        // ProxyURL is the proxy address that handles the request
        ProxyURL string
    }
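The unexported abort flag is exposed through Request.Abort, which lets an OnRequest callback cancel a request before it is sent. A minimal sketch that skips image URLs (strings import assumed):

    c.OnRequest(func(r *colly.Request) {
        if strings.HasSuffix(r.URL.Path, ".jpg") {
            r.Abort() // fetch returns before the HTTP request is made
        }
    })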

N. response.go defines the corresponding response:

    type Response struct {
        // StatusCode is the status code of the Response
        StatusCode int
        // Body is the content of the Response
        Body []byte
        // Ctx is a context between a Request and a Response
        Ctx *Context
        // Request is the Request object of the response
        Request *Request
        // Headers contains the Response's HTTP headers
        Headers *http.Header
        // Trace contains the HTTPTrace for the request. Will only be set by the
        // collector if Collector.TraceHTTP is set to true.
        Trace *HTTPTrace
    }
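response.go also attaches helpers to this struct, e.g. Save, which writes Body to a file. A minimal sketch (the file name is illustrative):

    c.OnResponse(func(r *colly.Response) {
        if r.StatusCode == 200 {
            if err := r.Save("page.html"); err != nil {
                fmt.Println("save failed:", err)
            }
        }
    })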

O. unmarshal.go defines the HTML unmarshalling routine:

    func UnmarshalHTML(v interface{}, s *goquery.Selection, structMap map[string]string) error
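UnmarshalHTML is usually reached through HTMLElement.Unmarshal, which maps struct fields onto the DOM via `selector` struct tags. A minimal sketch (the type and selectors are illustrative):

    type article struct {
        Title string `selector:"h1"`
        Intro string `selector:"p.intro"`
    }

    c.OnHTML("body", func(e *colly.HTMLElement) {
        var a article
        if err := e.Unmarshal(&a); err == nil {
            fmt.Println(a.Title, "-", a.Intro)
        }
    })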

P. xmlelement.go defines XMLElement, the XML counterpart of HTMLElement:

    type XMLElement struct {
        // Name is the name of the tag
        Name string
        Text string
        attributes interface{}
        // Request is the request object of the element's HTML document
        Request *Request
        // Response is the Response object of the element's HTML document
        Response *Response
        // DOM is the DOM object of the page. DOM is relative
        // to the current XMLElement and is either a html.Node or xmlquery.Node
        // based on how the XMLElement was created.
        DOM interface{}
        isHTML bool
    }

    func (h *XMLElement) ChildText(xpathQuery string) string
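XMLElement mirrors HTMLElement but is addressed with XPath through OnXML; a minimal sketch:

    c.OnXML("//a", func(e *colly.XMLElement) {
        fmt.Println("text:", e.Text, "href:", e.Attr("href"))
    })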

To sum up: a crawler has a few basic building blocks: a task queue of URLs to fetch, parsing of the fetched results, and local storage. A crawler can be regarded as a more elaborate HTTP client, but colly, by combining functional options with event hooks, abstracts and simplifies the crawling logic: users can conveniently define optional parameters and hook in their own task handlers, and thus implement a crawler very quickly.

