1. Install Chrome on CentOS 7
[root@localhost ~]# echo '[google-chrome]
name=google-chrome
baseurl=http://dl.google.com/linux/chrome/rpm/stable/x86_64
enabled=1
gpgcheck=1
gpgkey=https://dl.google.com/linux/linux_signing_key.pub
'> /etc/yum.repos.d/google-chrome.repo
[root@localhost ~]# yum -y install google-chrome-stable --nogpgcheck
2. Check the Chrome version
[root@localhost ~]# google-chrome-stable --version
3. Download chromedriver
Go to http://npm.taobao.org/mirrors/chromedriver/ and download the build matching your Chrome version. The code below expects the binary at /home/chromedriver, so unpack it there (or adjust seleniumPath).
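A minimal sketch of that step, assuming a Chrome 70 build (the version directory below is a placeholder; pick the one matching the output of google-chrome-stable --version):
[root@localhost ~]# ver=70.0.3538.97    # placeholder: match your Chrome version
[root@localhost ~]# wget http://npm.taobao.org/mirrors/chromedriver/$ver/chromedriver_linux64.zip
[root@localhost ~]# unzip chromedriver_linux64.zip -d /home
[root@localhost ~]# chmod +x /home/chromedriver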
4. Code
package main
import (
    "context"
    "fmt"
    "log"
    "time"

    "github.com/tebeka/selenium"
    "github.com/tebeka/selenium/chrome"
)
const (
    // Path to the chromedriver binary and the local port it listens on.
    seleniumPath = `/home/chromedriver`
    port         = 9515
)
var (
    chromeCaps = chrome.Capabilities{
        Prefs: map[string]interface{}{
            // Disable image loading to speed up rendering.
            "profile.managed_default_content_settings.images": 2,
        },
        Path: "",
        Args: []string{
            // "--headless",
            "--start-maximized",
            "--window-size=1920,1080",
            "--no-sandbox",
            "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
            "--disable-gpu",
            "--disable-impl-side-painting",
            "--disable-gpu-sandbox",
            "--disable-accelerated-2d-canvas",
            "--disable-accelerated-jpeg-decoding",
            "--test-type=ui",
            "--ignore-certificate-errors",
        },
    }
    // Options for the selenium service; left empty here, set as needed.
    ops     = []selenium.ServiceOption{}
    service *selenium.Service
    // Browser capabilities: request a Chrome session.
    caps = selenium.Capabilities{"browserName": "chrome"}
)
// InitService starts a chromedriver service in the background.
func InitService() (*selenium.Service, error) {
    // 1. Start the selenium service.
    return selenium.NewChromeDriverService(seleniumPath, port, ops...)
}
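The ops slice above is left empty. If chromedriver's own logs would help while debugging, tebeka/selenium ships service options such as selenium.Output; a minimal sketch (redirecting to stderr is an assumption, not part of the original setup):
// Debugging variant (assumption, not in the original post): stream
// chromedriver's output to stderr.
var debugOps = []selenium.ServiceOption{
    selenium.Output(os.Stderr), // requires importing "os"
}
Passing debugOps instead of ops to NewChromeDriverService turns the logging on.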
// [+] Walk the index and crawl article URLs page by page -----------------------------------------------------------
func Spider(parse ParseFactory) (err error) {
    // 1. Apply the custom Chrome capabilities.
    caps.AddChrome(chromeCaps)
    // 2. Attach a browser to the selenium driver. For reference, DefaultURLPrefix = "http://127.0.0.1:4444/wd/hub".
    wd, err := selenium.NewRemote(caps, fmt.Sprintf("http://127.0.0.1:%v/wd/hub", port))
    if err != nil {
        err = fmt.Errorf("unable to create browser, err: %v", err)
        return
    }
    defer wd.Close()
    // log.Fatal here would kill every worker goroutine, so log and return instead.
    if err = parse.Run(wd); err != nil {
        log.Println(1001, err)
    }
    return
}
func schedule(links []string) {
    var worker1 = make(chan bool, 1)
    var worker2 = make(chan bool, 1)
    var worker3 = make(chan bool, 1)
    var worker4 = make(chan bool, 1)
    worker1 <- true
    worker2 <- true
    worker3 <- true
    worker4 <- true
    n := len(links) / 4
    for {
        select {
        case <-worker1:
            go func() {
                Spider(&ReadArticalParser{ArticleLink: links[:n]})
                worker1 <- true
            }()
        case <-worker2:
            go func() {
                Spider(&ReadArticalParser{ArticleLink: links[n : n*2]})
                worker2 <- true
            }()
        case <-worker3:
            go func() {
                Spider(&ReadArticalParser{ArticleLink: links[n*2 : n*3]})
                worker3 <- true
            }()
        case <-worker4:
            go func() {
                Spider(&ReadArticalParser{ArticleLink: links[n*3:]})
                worker4 <- true
            }()
        default:
            time.Sleep(time.Second * 3)
        }
    }
}
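The four channels above implement a fixed pool of four workers, each endlessly re-crawling its quarter of the links. The same pattern generalizes to N workers; a sketch under the same assumptions (Spider and ReadArticalParser as defined in this post), offered as an alternative rather than a replacement:
// Hypothetical generalization (not in the original post): an N-worker pool
// over equal slices of links, doing the same endless re-crawl as schedule.
func schedulePool(links []string, workers int) {
    n := len(links) / workers
    for i := 0; i < workers; i++ {
        lo, hi := i*n, (i+1)*n
        if i == workers-1 {
            hi = len(links) // the last worker also takes the remainder
        }
        part := links[lo:hi]
        go func() {
            for {
                Spider(&ReadArticalParser{ArticleLink: part})
            }
        }()
    }
    select {} // block forever, like the for-loop in schedule
}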
func main() {
    service, err := InitService()
    if err != nil {
        log.Fatal(0001, err)
    }
    defer service.Stop()
    // First pass: collect all article links.
    linksParser := &LinksBuildParser{BaseURL: "https://blog.csdn.net/qq_38900565?t=1"}
    if err := Spider(linksParser); err != nil {
        log.Fatal(0002, err)
    }
    schedule(linksParser.ArticleLink)
}
// [+] Factory interface wrapping the parsers ------------------------------------------------------------------------
type ParseFactory interface {
    Run(selenium.WebDriver) error
}

// [-] LinksBuildParser crawls the article links.
type LinksBuildParser struct {
    BaseURL     string
    ArticleLink []string
}
func (l *LinksBuildParser) Run(wd selenium.WebDriver) (err error) {
    // 1. Open the root page.
    if err = wd.Get(l.BaseURL); err != nil {
        return
    }
    for {
        // 2. Collect the article links on the current page.
        l.ParseLink(wd)
        // 3. Click "next page"; stop once the pager button is gone.
        e, err := wd.FindElement(selenium.ByCSSSelector, "[class='js-page-next js-page-action ui-pager']")
        if err != nil {
            break
        }
        if err = e.Click(); err != nil {
            break
        }
    }
    return nil
}
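One fragile spot in Run: after Click, ParseLink may fire before the next page has rendered. tebeka/selenium's WaitWithTimeout can guard against that; a minimal sketch (the XPath mirrors the article-list selector used below, and the 10-second budget is an assumption):
// Hypothetical helper (not in the original post): poll until the article
// list is present on the page, giving up after an assumed 10-second budget.
func waitForList(wd selenium.WebDriver) error {
    return wd.WaitWithTimeout(func(wd selenium.WebDriver) (bool, error) {
        els, err := wd.FindElements(selenium.ByXPATH, `//*[@id="articleMeList-blog"]/div[2]/div`)
        if err != nil {
            return false, nil // not rendered yet; keep polling
        }
        return len(els) > 0, nil
    }, 10*time.Second)
}
Calling waitForList(wd) right after e.Click() would make the pagination loop less racy.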
// ParseLink collects the article links on the current page.
// type ParseLink func(selenium.WebDriver)
func (l *LinksBuildParser) ParseLink(wd selenium.WebDriver) {
    eles, err := wd.FindElements(selenium.ByXPATH, `//*[@id="articleMeList-blog"]/div[2]/div[@class='article-item-box csdn-tracking-statistics']`)
    if err != nil {
        log.Println(2001, err)
        return
    }
    for _, ele := range eles {
        readE, err := ele.FindElement(selenium.ByXPATH, "div[@class='info-box d-flex align-content-center']/p/span[@class='read-num']")
        if err != nil {
            log.Println(2002, err)
            continue
        }
        text, err := readE.Text()
        if err != nil {
            log.Println(2002, err)
            continue
        }
        // Skip articles whose read count has 5+ digits (i.e. 10k+ reads).
        if len([]byte(text)) > 4 {
            log.Println(2003, "read count over 10k", text)
            continue
        }
        a, err := ele.FindElement(selenium.ByXPATH, "h4/a")
        if err != nil {
            log.Println(2004, err)
            continue
        }
        if link, err := a.GetAttribute("href"); err == nil {
            l.ArticleLink = append(l.ArticleLink, link)
        }
    }
}
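The len([]byte(text)) > 4 test assumes the read-count span is a bare digit string, so five or more bytes means 10,000+ reads. Parsing the number makes the threshold explicit; a sketch (strconv and strings are standard-library imports you would add):
// Hypothetical variant (not in the original post) of the length check:
// parse the count and compare against 10000 directly.
func over10k(text string) bool {
    n, err := strconv.Atoi(strings.TrimSpace(text))
    return err == nil && n >= 10000
}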
// [-] ReadArticalParser opens each article to register a read.
type ReadArticalParser struct {
    ArticleLink []string
}

func (r *ReadArticalParser) Run(wd selenium.WebDriver) (err error) {
    log.Println(3000, wd.SetPageLoadTimeout(time.Second*15))
    for _, link := range r.ArticleLink {
        // ctx, cancel := context.WithTimeout(context.Background(), time.Second*15)
        // go r.Read(ctx, wd, link)
        // time.Sleep(time.Second * 15)
        // cancel()
        if err := wd.Get(link); err != nil {
            log.Println(3001, err)
        }
        time.Sleep(time.Second * 15)
    }
    return nil
}
// Read loads one article in a helper goroutine and returns when the load
// finishes or the context expires, whichever comes first.
func (r *ReadArticalParser) Read(ctx context.Context, wd selenium.WebDriver, link string) {
    quit := make(chan bool, 1)
    go func() {
        if err := wd.Get(link); err != nil {
            log.Println(3002, err)
        }
        quit <- true
    }()
    select {
    case <-ctx.Done():
        return
    case <-quit:
        return
    }
}
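The commented-out lines in Run show how Read was meant to be driven. Spelled out as a hypothetical driver, reusing the 15-second budget from SetPageLoadTimeout:
// Hypothetical driver (not in the original post), equivalent to the
// commented-out lines in Run: give each page load at most 15 seconds.
func (r *ReadArticalParser) runWithTimeout(wd selenium.WebDriver) {
    for _, link := range r.ArticleLink {
        ctx, cancel := context.WithTimeout(context.Background(), time.Second*15)
        r.Read(ctx, wd, link) // returns on load completion or timeout
        cancel()
    }
}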