admin 管理员组文章数量: 887016
首先登录以下网址,进入腾讯招聘网:https://hr.tencent.com/
首先分析一下网页数据的加载方式:是通过JSON接口返回、由脚本动态加载,还是静态页面直接返回?
看下network里面抓到的动态数据是否有哪些有用的东西
发现抓到的JSON请求里面没有传输任何岗位数据
页面请求也没有返回任何有关岗位的信息
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
# Search-results page for the keyword "python".
# NOTE(review): host restored to hr.tencent.com -- the bare "hr.tencent"
# looks like a truncated domain; confirm against the live site.
url = "https://hr.tencent.com/position.php?keywords=python"
# verify=False skips TLS certificate verification (kept from the original;
# it is what triggers InsecureRequestWarning). The timeout stops the script
# from blocking forever if the server never answers.
response = requests.get(url, headers=headers, verify=False, timeout=10).text
print(response)
执行结果如下
"C:\Program Files\Python36\python.exe" C:/Users/40122/Desktop/demo_py3/day03/oop.py
C:\Program Files\Python36\lib\site-packages\urllib3\connectionpool.py:857: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings
InsecureRequestWarning)
<link media="screen" href="//cdn.m.tencent/hr_static/css/all.css?max_age=86412" type="text/css" rel="stylesheet" />
<script type="text/javascript" src="//cdn.m.tencent/hr_static/js/jquery-1.7.2.min.js"></script>
<script type="text/javascript" src="//cdn.m.tencent/hr_static/js/jquery-ui-1.7.2.custom.min.js"></script>
<script type="text/javascript" src="//cdn.m.tencent/hr_static/js/thickbox.js"></script>
<link media="screen" href="//cdn.m.tencent/hr_static/css/thickbox.css" type="text/css" rel="stylesheet" />
<script type="text/javascript" src="//cdn.m.tencent/hr_static/js/functions.js"></script>
<script type="text/javascript" src="//cdn.m.tencent/hr_static/js/utils.js"></script>
<script language="javascript" src="//vm.gtimg/tencentvideo/txp/js/txplayer.js" charset="utf-8"></script>
<div id="header">
<div class="maxwidth">
<a href="index.php" class="left" id="logo"><img src="//cdn.m.tencent/hr_static/img/logo.png"/></a>
<div class="right" id="headertr">
<div class="right pl9" id="topshares">
<div class="shares">
<span class="left">分享到:</span>
<!--<a href="javascript:;" onclick="shareto('qqt','top');" id="qqt" title="分享到腾讯微博">分享到腾讯微博</a>-->
<a href="javascript:;" onclick="shareto('qzone','top');" id="qzone" title="分享到QQ空间">分享到QQ空间</a>
<!--<a href="javascript:;" onclick="shareto('pengyou','top');" id="pengyou" title="分享到腾讯朋友">分享到腾讯朋友</a>-->
<a href="javascript:;" onclick="shareto('sinat','top');"id="sinat" title="分享到新浪微博">分享到新浪微博</a>
<!--<a href="javascript:;" onclick="shareto('renren','top');"id="renren" title="分享到人人网">分享到人人网</a>-->
<!--<a href="javascript:;" onclick="shareto('kaixin001','top');"id="kaixin" title="分享到开心网">分享到开心网</a>-->
<div class="clr"></div>
</div>
<!--<a href="javascript:;">分享</a>-->
</div>
<!--<div class="right pl9">-->
<!--<a href="http://t.qq/QQjobs" id="tqq" target="_blank">收听腾讯招聘</a>-->
<!--</div>-->
<div class="right pr9">
<a href="login.php" id="header_login_anchor">登录</a><span class="plr9">|</span><a href="reg.php">注册</a>
<span class="plr9">|</span><a href="question.php">反馈建议</a>
<span class="plr9">|</span><a href="http://careers.tencent/global" target="_blank">Tencent Global Talent</a>
<script>
var User_Account = "";
</script>
</div>
<div class="clr"></div>
</div>
<div class="clr"></div>
</div>
<div id="menus">
<div class="maxwidth">
<ul id="menu" class="left">
<li id="nav1" ><a href="index.php"> </a></li>
<li id="nav2" class="active" ><a href="social.php"> </a></li>
<li id="nav3"><a href="about.php"> </a></li>
<li id="nav4"><a href="workInTencent.php"> </a></li>
</ul>
<a class="right texti9" target="_blank" id="navxy" href="http://join.qq">校园招聘</a>
<div class="clr"></div>
</div>
</div>
<div id="homeDep"><table id="homeads"><tr><td align="center"><a href="http://tencent.avature/career" target="blank">全球招聘</a></td><td align="center"><a href="http://game.qq/hr/" target="blank">互动娱乐事业群招聘</a></td><td align="center"><a href="http://hr.tencent/position.php?lid=&tid=&keywords=WXG" target="blank">微信事业群招聘</a></td><td align="center"><a href="http://hr.qq/" target="blank">技术工程事业群招聘</a></td><td align="center"><a href="http://snghr.tencent" target="blank">社交网络事业群招聘</a></td><td align="center"><a href="http://mighr.qq" target="blank">移动互联网事业群招聘</a></td><td align="center"><a href="http://hr.tencent/position.php?keywords=OMG" target="blank">网络媒体事业群招聘</a></td></tr></table></div> <div id="footer">
<div>
<a href="http://www.tencent/" target="_blank">关于腾讯</a><span>|</span><a href="http://www.qq/contract.shtml" target="_blank">服务条款</a><span>|</span><a href="http://hr.tencent/" target="_blank">腾讯招聘</a><span>|</span><a href="http://careers.tencent/global" target="_blank">Tencent Global Talent</a><span>|</span><a href="http://gongyi.qq/" target="_blank">腾讯公益</a><span>|</span><a href="http://service.qq/" target="_blank">客服中心</a>
</div>
<p>Copyright © 1998 - 2018 Tencent. All Rights Reserved.</p>
</div>
</html>
Process finished with exit code 0
从页面结构中可以看到,数据都放在tbody这个标签里面
再用bs4或者正则过滤下
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
# NOTE(review): host restored to hr.tencent.com -- the bare "hr.tencent"
# looks like a truncated domain; confirm against the live site.
url = "https://hr.tencent.com/position.php?keywords=python"
# verify=False skips TLS certificate verification (kept from the original).
# The timeout stops the script from blocking forever on a stalled connection.
response = requests.get(url, headers=headers, verify=False, timeout=10).text
soup = BeautifulSoup(response, 'lxml')
# Dump every <table class="tablelist"> found in the page.
print(soup.find_all('table', class_='tablelist'))
数据一下子展示出来了。再用bs4过滤下
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent sent with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
# NOTE(review): host restored to hr.tencent.com -- the bare "hr.tencent"
# looks like a truncated domain; confirm against the live site.
url = "https://hr.tencent.com/position.php?keywords=python"
# verify=False skips TLS certificate verification (kept from the original).
# The timeout stops the script from blocking forever on a stalled connection.
response = requests.get(url, headers=headers, verify=False, timeout=10).text
soup = BeautifulSoup(response, 'lxml')
# Passing a list to class_ matches rows carrying either class.
jobList = soup.find_all('tr', class_=['even', 'odd'])
for job in jobList:
    # The anchor in the first cell carries both the title and the detail
    # link, so select it once and reuse it.
    link = job.select("td:nth-of-type(1) > a")[0]
    # Job title
    jobName = link.text
    # Detail-page URL (the href is appended to the site root)
    joburl = "https://hr.tencent.com/" + link['href']
    # Category
    jobType = job.select("td:nth-of-type(2)")[0].text
    # Number of openings
    jobnum = job.select("td:nth-of-type(3)")[0].text
    # Location
    jobAddr = job.select("td:nth-of-type(4)")[0].text
    print(jobName, joburl, jobType, jobnum, jobAddr)
第一页的数据已经抓下,
进去链接获取下岗位信息
可以通过嵌套一个requests请求获取到这些信息
再看下能获取到的其他信息
再点击页码可知数据是静态加载的,因为每一页的URL都会改变。
还有需要知道总页数,这样循环即可抓到每个页面。network里面找不到其他相关的信息,发现每个页面信息都展示10条信息,分别在even和odd中
这下试下找招聘总数,这样就能获取到页数信息了
职位数量在一个span的标签中
开始写代码
import requests
from bs4 import BeautifulSoup
import math
def getJobOrder(url):
    """Fetch a job detail page and return its duties and requirements text.

    Reads the module-level ``headers`` dict for the request headers.

    :param url: URL of a position detail page.
    :return: tuple ``(jobRes, jobOrder)`` -- the text of the first and the
        second ``ul.squareli`` list on the page (duties, then requirements).
        A list that is missing from the page yields an empty string instead
        of raising ``IndexError``.
    """
    # verify=False skips TLS certificate verification (kept from the
    # original). The timeout keeps one stalled detail page from hanging
    # the whole crawl.
    response = requests.get(url, headers=headers, verify=False, timeout=10).text
    soup = BeautifulSoup(response, 'lxml')
    # Select once and index the result, guarding against pages that carry
    # fewer than two lists.
    sections = soup.select("ul.squareli")
    jobRes = sections[0].text if len(sections) > 0 else ""
    jobOrder = sections[1].text if len(sections) > 1 else ""
    return jobRes, jobOrder
def getJobInfo(url):
    """Print every job listed on one search-results page.

    For each ``tr`` row with class ``even`` or ``odd`` this prints the title,
    detail URL, category, number of openings and location, then the duties
    and requirements text fetched from the detail page via ``getJobOrder``.
    Reads the module-level ``headers`` dict for the request headers.

    :param url: URL of a search-results page.
    :return: None
    """
    # verify=False skips TLS certificate verification (kept from the
    # original). The timeout keeps a stalled page from hanging the crawl.
    response = requests.get(url, headers=headers, verify=False, timeout=10).text
    soup = BeautifulSoup(response, 'lxml')
    # Passing a list to class_ matches rows carrying either class.
    jobList = soup.find_all('tr', class_=["even", 'odd'])
    for job in jobList:
        # The anchor in the first cell carries both the title and the
        # detail link, so select it once and reuse it.
        link = job.select("td:nth-of-type(1) > a")[0]
        # Job title
        jobName = link.text
        # Detail-page URL.
        # NOTE(review): host restored to hr.tencent.com -- the bare
        # "hr.tencent" looks like a truncated domain; confirm.
        joburl = "https://hr.tencent.com/" + link['href']
        # Duties and requirements come from a second request to the detail page.
        jobRes, jobOrder = getJobOrder(joburl)
        # Category
        jobType = job.select("td:nth-of-type(2)")[0].text
        # Number of openings
        jobnum = job.select("td:nth-of-type(3)")[0].text
        # Location
        jobAddr = job.select("td:nth-of-type(4)")[0].text
        print(jobName, joburl, jobType, jobnum, jobAddr)
        print(jobRes, jobOrder)
def getJobPageNum(url):
    """Return the number shown in the page's ``lightblue total`` span.

    Despite the function name, the returned value is the total shown in that
    span, not a page count; the caller divides it by the page size to get the
    number of pages. Reads the module-level ``headers`` dict for the request
    headers.

    :param url: URL of a search-results page.
    :return: the span's text converted to ``int``.
    :raises IndexError: if the page has no matching span.
    :raises ValueError: if the span text is not an integer.
    """
    # verify=False skips TLS certificate verification (kept from the
    # original). The timeout keeps a stalled request from hanging the script.
    response = requests.get(url, headers=headers, verify=False, timeout=10).text
    soup = BeautifulSoup(response, 'lxml')
    # Exact attribute match: the class attribute must be precisely
    # "lightblue total".
    num = soup.select('span[class="lightblue total"]')[0].text
    print(num)
    return int(num)
if __name__ == '__main__':
    # getJobOrder / getJobInfo / getJobPageNum all read a module-level
    # ``headers``, which this script never defined; define it here so the
    # crawl does not fail with NameError on the first request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
    # Seed URL: first results page for the keyword "python".
    # NOTE(review): host restored to hr.tencent.com -- the bare "hr.tencent"
    # looks like a truncated domain; confirm against the live site.
    url = "https://hr.tencent.com/position.php?keywords=python"
    # Total reported by the site (not a page count yet).
    jobTotal = getJobPageNum(url)
    # Each results page lists 10 rows; round up so a partial last page is included.
    pageCount = math.ceil(jobTotal / 10)
    for i in range(pageCount):
        # ``start`` is the offset of the first row on the page.
        url = "https://hr.tencent.com/position.php?keywords=python&start=%d#a" % (i * 10)
        getJobInfo(url)
运行如下
如需保存,将结果写入一个文件即可。
版权声明:本文为博主原创文章,未经博主允许不得转载。https://my.csdn/pangzhaowen
版权声明:本文标题:python爬虫小项目--抓取腾讯招聘岗位信息 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.freenas.com.cn/jishu/1727248955h1088663.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论