import urllib2
import urllib
import re
import lxml
import lxml.etree
import threading
# Module-level re-entrant lock. Nothing in the visible part of this file
# acquires it.
rlock = threading.RLock()

# HTTP headers attached to every request built in this module: a desktop
# Chrome User-Agent string.
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36",
}
def getJobNum(url):
request = urllib2.Request(url, headers=header)
response = urllib2.urlopen(request).read()
myre = "<em>(\d+)</em>"
regex = re.compile(myre)
res = regex.findall(response)
print res
return res[0]
def getWorkMessage(url):
    """Return the text between the SWSStringCut markers of the page at ``url``.

    The page is downloaded with the module-level ``header`` dict as request
    headers. The text enclosed by the HTML comments
    ``<!-- SWSStringCutStart -->`` and ``<!-- SWSStringCutEnd -->`` is
    extracted, and the substrings ``"\\r\\n"``, ``"<p>"``, ``"</p>"`` and
    ``"<br/>"`` are removed from it.

    :param url: address of the page to download.
    :returns: the extracted text with the substrings above stripped out.
    :raises IndexError: if the page does not contain both markers.
    """
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Close explicitly so the socket is released even if read() fails.
        response.close()

    # "[\s\S]" consumes exactly one character right after the start marker.
    # "(.*)" is greedy and, with re.S, spans newlines, so the capture runs up
    # to the LAST end marker on the page.
    matches = re.findall(
        r"<!-- SWSStringCutStart -->[\s\S](.*)<!-- SWSStringCutEnd -->",
        html,
        re.S,
    )

    text = matches[0]
    for fragment in ("\r\n", "<p>", "</p>", "<br/>"):
        text = text.replace(fragment, "")
    return text
def getJob(ulist):
for url in ulist:
request = urllib2.Request(url, headers=header)
response = urllib2.urlopen(request).read()
mytree = lxml.etree.HTML(response)
joblist = mytree.xpath("//table[@class=\"newlist\"]")
print joblist
print len(joblist)
for job in joblist:
jobname = job.xpath("//tr/td[1]/div/a[1]/text()")
jobLink = job.xpath("//tr/td[1]/div/a[1]/@href")
companyName = job.xpath("//tr/td[3]/a[1]/text()")
salary = job.xpath("//tr/td[4]/text()")
workplace = job.xpath("//tr/td[5]/text()")
try:
for i in range(len(jobname)):
print jobname[i], getWorkMessage(jobLink[i]), companyName[i], salary[i], workplace[i]
except:
pass
if __name__ == '__main__':
url = "https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw=python&sm=0&p=1"
num = 2910
page = 0
if num % 60 == 0:
page = num // 60
else:
page = num // 60 + 1
myList = ["https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw=python&sm=0&p=" + str(i) for i in
range(1, page + 1)]
for i in myList:
print i
urllist = [[] for _ in range(10)]
N = len(urllist)
for i in range(len(myList)):
urllist[i % N].append(myList[i])
ThreadList = []
for ulist in urllist:
t = threading.Thread(target=getJob, args=(ulist,))
t.start()
ThreadList.append(t)
for t in ThreadList:
t.join()