import urllib2
import urllib
import re
import lxml
import lxml.etree
import threading
# HTTP headers attached to every request made by this module; the
# User-Agent value is a desktop Chrome 61 identification string.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/61.0.3163.79 Safari/537.36"
    ),
}
def getJobNum(url):
request = urllib2.Request(url, headers=header)
response = urllib2.urlopen(request).read()
myre = "<em>(\d+)</em>"
regex = re.compile(myre)
res = regex.findall(response)
print res
return res[0]
def getWorkMessage(url):
    """Fetch a job detail page and return its description text.

    The description is whatever lies between the
    ``<!-- SWSStringCutStart -->`` and ``<!-- SWSStringCutEnd -->`` comment
    markers, minus the single character that directly follows the start
    marker. The capture is greedy, so if the end marker occurs more than
    once the text runs up to its last occurrence. CRLF line breaks and the
    literal tags ``<p>``, ``</p>`` and ``<br/>`` are then removed; any other
    markup is left in place.

    Args:
        url: Address of the job detail page.

    Returns:
        The cleaned description as a string.

    Raises:
        IndexError: If the page does not contain both markers.
        urllib2.URLError: If the page cannot be downloaded.
    """
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Release the connection even when read() fails.
        response.close()

    # re.S lets "." span line breaks, since the description is multi-line.
    found = re.search(
        r"<!-- SWSStringCutStart -->[\s\S](.*)<!-- SWSStringCutEnd -->",
        html,
        re.S,
    )
    if found is None:
        # Same exception type as indexing an empty findall() result, but
        # with a message that says which page was at fault.
        raise IndexError("no SWSStringCut markers found at %s" % url)

    text = found.group(1)
    for token in ("\r\n", "<p>", "</p>", "<br/>"):
        text = text.replace(token, "")
    return text
def getJob(url):
request = urllib2.Request(url, headers=header)
response = urllib2.urlopen(request).read()
mytree = lxml.etree.HTML(response)
joblist = mytree.xpath("//table[@class=\"newlist\"]")
print joblist
print len(joblist)
jobname = []
jobLink = []
companyName = []
salary = []
workplace = []
for job in joblist:
jobname = job.xpath("//tr/td[1]/div/a[1]/text()")
jobLink = job.xpath("//tr/td[1]/div/a[1]/@href")
companyName = job.xpath("//tr/td[3]/a[1]/text()")
salary = job.xpath("//tr/td[4]/text()")
workplace = job.xpath("//tr/td[5]/text()")
try:
for i in range(len(jobname)):
print jobname[i], getWorkMessage(jobLink[i]), companyName[i], salary[i], workplace[i]
except:
pass
if __name__ == '__main__':
    # Script entry point: start one thread per result page, each running
    # getJob on that page's URL.
    url = "https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw=python&sm=0&p=1"
    num = 2910

    # Number of pages = num / 60 rounded up (presumably 60 listings per
    # page and 2910 total matches; both values are hard-coded here).
    page = num // 60 + (1 if num % 60 else 0)

    myList = []
    for i in range(1, page + 1):
        myList.append("https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw=python&sm=0&p=" + str(i))

    for i in myList:
        worker = threading.Thread(target=getJob, args=(i,))
        worker.start()