爬取51job上GIS岗位招聘信息

2018年4月25日 0 条评论 1.15k 次阅读 3 人点赞

开发语言

python


安装库

#安装requests、beautifulsoup4和lxml(代码中使用lxml解析器,需要一并安装;以下命令在系统命令行中执行,而不是在Python解释器中)
$ pip install requests
$ pip install beautifulsoup4
$ pip install lxml

代码:


"""Crawl GIS job postings from 51job and append them to ``test.csv``.

For every posting on the search-result pages the script fetches the detail
page, geocodes the work address with the Baidu geocoder, normalises the
salary range to a monthly amount and writes one CSV row.

``BAIDU_AK`` is a placeholder ("************"); replace it with a real
Baidu Maps API key before running.
"""
import csv
import json
import os

import requests
from bs4 import BeautifulSoup

OUTPUT_PATH = 'test.csv'
CSV_HEADER = ["职位", "公司", "地址", "x", "y", "最低工资", "最高工资", "职位描述", "时间"]

LIST_URL_PREFIX = "https://search.51job.com/list/000000,000000,0000,00,9,99,GIS,2,"
FIRST_PAGE = 1
LAST_PAGE = 111

BAIDU_GEOCODER_URL = "http://api.map.baidu.com/geocoder/v2/"
BAIDU_AK = "************"

# Seconds to wait for any single HTTP request before giving up; without a
# timeout one stalled connection would hang the whole crawl.
REQUEST_TIMEOUT = 30

# Monthly salary used when the listing gives no parsable range.
# NOTE(review): the original comment said the maximum should default to 6000
# while the code used 4000; the code's value is kept -- confirm the intent.
DEFAULT_SALARY_MIN = 4000
DEFAULT_SALARY_MAX = 4000

# (unit suffix, multiplier, divisor) -- converts the listed figure to a
# monthly amount, e.g. "4-6千/月", "0.6-1万/月", "11-12万/年".
SALARY_UNITS = (
    ("千/月", 1000, 1),
    ("万/月", 10000, 1),
    ("万/年", 10000, 12),
)

# "Share / WeChat / e-mail" widget text that trails the job description.
SHARE_BOILERPLATE = "\n\n\n\n分享\n\n\n微信\n邮件\n\n\n\n"

# Label in front of the work address on the detail page. Both the ASCII and
# the full-width colon are stripped because it is unclear which one the page
# uses -- TODO confirm against a live page.
ADDRESS_LABELS = ("上班地址:", "上班地址:")

# Every posting collected during this run, in crawl order.
jobs_list = []


def _fetch_soup(url):
    """Download *url*, decode it as GBK and return the parsed page.

    Returns ``None`` (after printing a message) when the request fails, so
    one unreachable page does not abort the whole crawl.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print("request failed, skipped: " + url + " (" + str(err) + ")")
        return None
    response.encoding = "gbk"
    return BeautifulSoup(response.text, "lxml")


def _parse_salary(salary_text):
    """Return ``(salary_min, salary_max)`` per month for a listing salary.

    A single figure without "-" is used for both bounds. Text with an
    unknown unit or an unparsable number yields the default salaries.
    """
    for unit, multiplier, divisor in SALARY_UNITS:
        if unit not in salary_text:
            continue
        low_text, dash, high_text = salary_text.replace(unit, "").partition("-")
        try:
            low = float(low_text)
            high = float(high_text) if dash else low
        except ValueError:
            break
        return low * multiplier / divisor, high * multiplier / divisor
    return DEFAULT_SALARY_MIN, DEFAULT_SALARY_MAX


def _geocode(addr):
    """Return ``(x, y)`` for *addr* from the Baidu geocoder, or ``(0, 0)``.

    As in the original script, ``x`` holds the latitude and ``y`` the
    longitude. NOTE(review): GIS tools usually expect x = longitude --
    confirm before changing, since existing CSV data uses this order.
    """
    query = {"output": "json", "ak": BAIDU_AK, "address": addr}
    try:
        # Passing the address through ``params`` URL-encodes it, so
        # characters such as "&" or "#" cannot break the query string.
        response = requests.get(
            BAIDU_GEOCODER_URL, params=query, timeout=REQUEST_TIMEOUT
        )
        # NOTE(review): the original decodes the reply as GBK; kept as is.
        response.encoding = "gbk"
        location = json.loads(response.text)["result"]["location"]
        return location["lat"], location["lng"]
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Geocoding is best-effort: network errors, non-JSON replies and
        # replies without a location all fall back to (0, 0).
        return 0, 0


def _extract_address(company_soup, job):
    """Return the work address from the detail page, else the list entry's."""
    try:
        addr = company_soup.find("div", "bmsg inbox").find("p", "fp").text.strip()
    except AttributeError:
        # The detail page has no address block: use the listing's location.
        return job.find("span", "t3").text
    for label in ADDRESS_LABELS:
        addr = addr.replace(label, "")
    return addr


def _extract_description(company_soup):
    """Return the job description flattened to one line, or "无" if absent."""
    try:
        text = company_soup.find("div", class_="bmsg job_msg inbox").text
    except AttributeError:
        return "无"
    # Commas need no special handling here: csv.writer quotes such fields.
    return (
        text.replace(SHARE_BOILERPLATE, "")
        .replace("\t", "")
        .replace("\r", "")
        .replace("\n", "/n")
    )


def _build_job(job):
    """Build the record for one listing entry, or ``None`` if it is skipped."""
    # The first span of an entry links to the posting's detail page.
    company_soup = _fetch_soup(job.find("span").find("a")["href"])
    if company_soup is None:
        return None
    addr = _extract_address(company_soup, job)
    x, y = _geocode(addr)
    salary_min, salary_max = _parse_salary(job.find("span", "t4").text)
    return {
        "job": job.find("span").text.strip(),
        "company": job.find("span", "t2").text,
        "addr": addr,
        "addr_x": x,
        "addr_y": y,
        "salary_min": salary_min,
        "salary_max": salary_max,
        "description": _extract_description(company_soup),
        "time": job.find("span", "t5").text,
    }


def main():
    """Crawl every result page and append one CSV row per posting."""
    # The file is opened in append mode, so the header is written only when
    # the file is new or empty; otherwise every run would add another one.
    needs_header = (
        not os.path.exists(OUTPUT_PATH) or os.path.getsize(OUTPUT_PATH) == 0
    )
    with open(OUTPUT_PATH, 'a', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file, lineterminator="\n")
        if needs_header:
            writer.writerow(CSV_HEADER)
        for num in range(FIRST_PAGE, LAST_PAGE + 1):
            page_soup = _fetch_soup(LIST_URL_PREFIX + str(num) + ".html")
            if page_soup is None:
                continue
            result_list = page_soup.find("div", id="resultList")
            if result_list is None:
                print("no result list on page " + str(num) + ", skipped")
                continue
            # The first "el" div is the table header, not a posting.
            for job in result_list.find_all('div', "el")[1:]:
                ajob = _build_job(job)
                if ajob is None:
                    continue
                print(str(len(jobs_list)) + "," + str(ajob))
                jobs_list.append(ajob)
                writer.writerow([
                    ajob['job'],
                    ajob['company'],
                    ajob['addr'],
                    ajob['addr_x'],
                    ajob['addr_y'],
                    ajob['salary_min'],
                    ajob['salary_max'],
                    ajob['description'],
                    ajob['time'],
                ])


if __name__ == "__main__":
    main()

分析结果

《从招聘数据看GIS就业》

肖大昕

这个人太懒什么东西都没留下

文章评论(0)