import urllib2
import urllib
import re
import lxml
import lxml.etree
import multiprocessing
# HTTP headers attached to every request this module builds; the only entry
# is a desktop Chrome User-Agent string.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/61.0.3163.79 Safari/537.36"
    ),
}
def getJobNum(url):
request = urllib2.Request(url, headers=header)
response = urllib2.urlopen(request).read()
myre = "<em>(\d+)</em>"
regex = re.compile(myre)
res = regex.findall(response)
print res
return res[0]
def getWorkMessage(url):
    """Fetch ``url`` and return the text between the SWSStringCut markers.

    The page is requested with the module-level ``header`` dict. The text is
    taken from between the ``<!-- SWSStringCutStart -->`` and
    ``<!-- SWSStringCutEnd -->`` HTML comments; exactly one character right
    after the start marker is skipped. The capture is greedy and ``.`` matches
    newlines, so it extends to the last end marker on the page.

    Args:
        url: Address of the page to download.

    Returns:
        The captured text with every ``"\\r\\n"``, ``<p>``, ``</p>`` and
        ``<br/>`` removed.

    Raises:
        IndexError: If the markers are not found in the page.
        urllib2.URLError: Propagated from ``urllib2.urlopen`` on failure.
    """
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Release the connection explicitly instead of relying on garbage
        # collection of the response object.
        response.close()

    # Raw string so ``\s`` / ``\S`` are unambiguously regex escapes.
    myre = r"<!-- SWSStringCutStart -->[\s\S](.*)<!-- SWSStringCutEnd -->"
    res = re.findall(myre, html, re.S)
    if not res:
        # Same exception type the original ``res[0]`` raised, but with a
        # message that says which page was missing the markers.
        raise IndexError("SWSStringCut markers not found at %r" % (url,))

    text = res[0]
    for fragment in ("\r\n", "<p>", "</p>", "<br/>"):
        text = text.replace(fragment, "")
    return text
def getJob(ulist, queue):
for url in ulist:
request = urllib2.Request(url, headers=header)
response = urllib2.urlopen(request).read()
mytree = lxml.etree.HTML(response)
joblist = mytree.xpath("//table[@class=\"newlist\"]")
print joblist
print len(joblist)
for job in joblist:
jobname = job.xpath("//tr/td[1]/div/a[1]/text()")
jobLink = job.xpath("//tr/td[1]/div/a[1]/@href")
companyName = job.xpath("//tr/td[3]/a[1]/text()")
salary = job.xpath("//tr/td[4]/text()")
workplace = job.xpath("//tr/td[5]/text()")
try:
for i in range(len(jobname)):
print jobname[i], getWorkMessage(jobLink[i]), companyName[i], salary[i], workplace[i]
except:
pass
return queue.put(1)
if __name__ == '__main__':
url = "https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw=python&sm=0&p=1"
num = 2
page = 0
if num % 60 == 0:
page = num // 60
else:
page = num // 60 + 1
myList = ["https://sou.zhaopin.com/jobs/searchresult.ashx?jl=%E6%B7%B1%E5%9C%B3&kw=python&sm=0&p=" + str(i) for i in
range(1, page + 1)]
for i in myList:
print i
urllist = [[] for _ in range(10)]
N = len(urllist)
for i in range(len(myList)):
urllist[i % N].append(myList[i])
processList = []
queue = multiprocessing.Manager().Queue()
for ulist in urllist:
p = multiprocessing.Process(target=getJob, args=(ulist, queue))
p.start()
processList.append(p)
for t in processList:
t.join()
while not queue.empty():
print queue.get()