
1. Install Chrome on CentOS 7

[root@localhost ~]# echo '[google-chrome]
name=google-chrome
baseurl=http://dl.google.com/linux/chrome/rpm/stable/x86_64
enabled=1
gpgcheck=1
gpgkey=https://dl.google.com/linux/linux_signing_key.pub
'> /etc/yum.repos.d/google-chrome.repo
[root@localhost ~]# yum -y install google-chrome-stable --nogpgcheck

2. Check the Chrome version

[root@localhost ~]# google-chrome-stable --version

3. Download chromedriver
Download the version matching your Chrome from http://npm.taobao.org/mirrors/chromedriver/ and extract it. Note: the code below reads the binary from /home/chromedriver, so either extract it there or adjust the seleniumPath constant.
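For example (the version directory below is illustrative; pick the release that matches your installed Chrome):

[root@localhost ~]# wget http://npm.taobao.org/mirrors/chromedriver/70.0.3538.97/chromedriver_linux64.zip
[root@localhost ~]# unzip chromedriver_linux64.zip -d /home/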

4. Code

package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github/tebeka/selenium"
	"github/tebeka/selenium/chrome"
)

const (
	// Constants: the path to the chromedriver binary and the local port it listens on.
	seleniumPath = `/home/chromedriver`
	port         = 9515
)

var (
	chromeCaps = chrome.Capabilities{
		Prefs: map[string]interface{}{ // disable image loading to speed up rendering
			"profile.managed_default_content_settings.images": 2,
		},
		Path: "",
		Args: []string{
			// "--headless",
			"--start-maximized",
			"--window-size=1920x1080",
			"--no-sandbox",
			"--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
			"--disable-gpu",
			"--disable-impl-side-painting",
			"--disable-gpu-sandbox",
			"--disable-accelerated-2d-canvas",
			"--disable-accelerated-jpeg-decoding",
			"--test-type=ui",
			"--ignore-certificate-errors",
		},
	}
	// Options for the selenium service; left empty here, set as needed.
	ops     = []selenium.ServiceOption{}
	service *selenium.Service
	// Browser capabilities: ask selenium for a chrome browser.
	caps = selenium.Capabilities{"browserName": "chrome"}
)
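
// Note (an assumption, not from the original post): tebeka/selenium also
// provides ServiceOption helpers; for example, selenium.Output(os.Stderr) in
// the ops slice above would echo chromedriver's log to stderr (and would need
// "os" added to the imports).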

// InitService starts a chromedriver service in the background.
func InitService() (*selenium.Service, error) {
	// 1. Start the selenium service.
	return selenium.NewChromeDriverService(seleniumPath, port, ops...)
}

// [+] Walk the index page by page and collect article URLs ---------------------------------------------------------------------------------------------
func Spider(parse ParseFactory) (err error) {
	// 1. Load the custom browser configuration.
	caps.AddChrome(chromeCaps)
	// 2. Attach a browser session to the selenium driver. For reference, DefaultURLPrefix = "http://127.0.0.1:4444/wd/hub".
	wd, err := selenium.NewRemote(caps, fmt.Sprintf("http://127.0.0.1:%v/wd/hub", port))
	if err != nil {
		err = fmt.Errorf("unable to create browser, err: %v", err)
		return
	}
	// Quit ends the whole session, not just the current window.
	defer wd.Quit()
	if err = parse.Run(wd); err != nil {
		// Log and return instead of log.Fatal, so one failed worker does not kill the process.
		log.Println(1001, err)
	}
	return
}

// schedule splits links into four quarters and keeps four workers crawling
// them; each buffered channel is a token that a worker hands back when done.
func schedule(links []string) {
	var worker1 = make(chan bool, 1)
	var worker2 = make(chan bool, 1)
	var worker3 = make(chan bool, 1)
	var worker4 = make(chan bool, 1)
	worker1 <- true
	worker2 <- true
	worker3 <- true
	worker4 <- true
	n := len(links) / 4
	// Loop forever: whenever a worker's token comes back, restart it on its quarter.
	for {
		select {
		case <-worker1:
			go func() {
				Spider(&ReadArticalParser{ArticleLink: links[:n]})
				worker1 <- true
			}()
		case <-worker2:
			go func() {
				Spider(&ReadArticalParser{ArticleLink: links[n : n*2]})
				worker2 <- true
			}()
		case <-worker3:
			go func() {
				Spider(&ReadArticalParser{ArticleLink: links[n*2 : n*3]})
				worker3 <- true
			}()
		case <-worker4:
			go func() {
				Spider(&ReadArticalParser{ArticleLink: links[n*3:]})
				worker4 <- true
			}()
		default:
			time.Sleep(time.Second * 3)
		}
	}
}
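
// Alternative sketch (not in the original post): the same four-way split can
// be written to crawl each quarter exactly once using a done channel, instead
// of looping forever as schedule above deliberately does.
func crawlOnce(links []string) {
	n := len(links) / 4
	parts := [][]string{links[:n], links[n : n*2], links[n*2 : n*3], links[n*3:]}
	done := make(chan bool)
	for _, part := range parts {
		go func(p []string) {
			Spider(&ReadArticalParser{ArticleLink: p})
			done <- true
		}(part)
	}
	// Wait for all four workers to finish.
	for range parts {
		<-done
	}
}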

func main() {
	service, err := InitService()
	if err != nil {
		log.Fatal(0001, err)
	}
	defer service.Stop()
	// Collect all article links first.
	linksParser := &LinksBuildParser{BaseURL: "https://blog.csdn.net/qq_38900565?t=1"}
	if err := Spider(linksParser); err != nil {
		log.Fatal(0002, err)
	}
	schedule(linksParser.ArticleLink)
}

// [+] Factory interface wrapping each crawler ------------------------------------------------------------------------------------------------------------
type ParseFactory interface {
	Run(selenium.WebDriver) error
}

// [-] LinksBuildParser collects article links page by page.
type LinksBuildParser struct {
	BaseURL     string
	ArticleLink []string
}

func (l *LinksBuildParser) Run(wd selenium.WebDriver) (err error) {
	// 1. Open the root page.
	if err = wd.Get(l.BaseURL); err != nil {
		return
	}
	for {
		// 2. Scrape the article links on the current page.
		l.ParseLink(wd)
		// 3. Click the "next page" button; stop when there is none.
		e, err := wd.FindElement(selenium.ByCSSSelector, "[class='js-page-next js-page-action ui-pager']")
		if err != nil {
			break
		}
		if err = e.Click(); err != nil {
			break
		}
	}
	return nil
}

// ParseLink scrapes the article links on the current page.
func (l *LinksBuildParser) ParseLink(wd selenium.WebDriver) {
	eles, err := wd.FindElements(selenium.ByXPATH, `//*[@id="articleMeList-blog"]/div[2]/div[@class='article-item-box csdn-tracking-statistics']`)
	if err != nil {
		log.Println(2001, err)
		return
	}
	for _, ele := range eles {
		readE, err := ele.FindElement(selenium.ByXPATH, "div[@class='info-box d-flex align-content-center']/p/span[@class='read-num']")
		if err != nil {
			log.Println(2002, err)
			continue
		}
		text, err := readE.Text()
		if err != nil {
			log.Println(2002, err)
			continue
		}
		// Skip articles whose read-count text is longer than four digits (10k+ reads).
		if len([]byte(text)) > 4 {
			log.Println(2003, "read count over 10k", text)
			continue
		}
		a, err := ele.FindElement(selenium.ByXPATH, "h4/a")
		if err != nil {
			log.Println(2004, err)
			continue
		}
		if link, err := a.GetAttribute("href"); err == nil {
			l.ArticleLink = append(l.ArticleLink, link)
		}

	}
}
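
// A more explicit alternative (a sketch, not part of the original post) to the
// byte-length check in ParseLink: parse the read-count text and compare it to
// 10000 directly. Uses only fmt, which is already imported.
func overTenThousandReads(text string) bool {
	var n int
	// Treat unparseable text as "not over 10k" and let the caller keep the link.
	if _, err := fmt.Sscanf(text, "%d", &n); err != nil {
		return false
	}
	return n >= 10000
}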

// [-] ReadArticalParser opens each article link to register a page view.
type ReadArticalParser struct {
	ArticleLink []string
}

func (r *ReadArticalParser) Run(wd selenium.WebDriver) (err error) {
	// Cap page loads at 15 seconds; log the result of setting the timeout.
	log.Println(3000, wd.SetPageLoadTimeout(time.Second*15))
	for _, link := range r.ArticleLink {
		// ctx, cancel := context.WithTimeout(context.Background(), time.Second*15)
		// go r.Read(ctx, wd, link)
		// time.Sleep(time.Second * 15)
		// cancel()
		if err := wd.Get(link); err != nil {
			log.Println(3001, err)
		}
		// Stay on the page long enough for the visit to register.
		time.Sleep(time.Second * 15)
	}
	return nil
}

// Read loads link in a goroutine and returns when the load finishes or ctx times out.
func (r *ReadArticalParser) Read(ctx context.Context, wd selenium.WebDriver, link string) {
	quit := make(chan bool, 1)
	go func() {
		if err := wd.Get(link); err != nil {
			log.Println(3002, err)
		}
		quit <- true
	}()
	for {
		select {
		case <-ctx.Done():
			return
		case <-quit:
			return
		default:
			time.Sleep(time.Second)
		}
	}
}
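
5. Build and run

Assuming the listing is saved as main.go under your GOPATH (the pre-modules import paths above suggest an older Go toolchain), fetching the dependency and running the crawler looks like this:

[root@localhost ~]# go get github.com/tebeka/selenium
[root@localhost ~]# go run main.go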

Tags: selenium, golang, URL, crawling page by page