import urllib
from urllib import request
import lxml
import lxml.etree
import re
def makeurllist(url):
    """Build the list of paginated listing URLs advertised by the page at *url*.

    Downloads *url*, takes the first text node found under the element whose
    class attribute is "dxypage clearfix", and extracts the total page count
    from text of the form "页次:1/<N> 每页".

    Args:
        url: Address of a listing page that contains the pagination block.

    Returns:
        A list of N strings "http://www.jb51.net/list/list_97_<i>.htm" for
        i = 1..N. The URL prefix is hard-coded; it is not derived from *url*.

    Raises:
        IndexError: If the pagination element yields no text, or its first
            text node does not contain the expected page-count pattern.

    Errors raised by ``urllib.request.urlopen`` propagate unchanged.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0);"}
    req = urllib.request.Request(url, headers=headers)
    # Read inside a context manager so the connection is always released.
    with urllib.request.urlopen(req) as response:
        data = response.read()

    tree = lxml.etree.HTML(data)
    pager_texts = tree.xpath("//*[@class=\"dxypage clearfix\"]//text()")
    if not pager_texts:
        raise IndexError("no text found under the 'dxypage clearfix' element")

    # Raw string keeps "\d" a regex token instead of an invalid str escape.
    match = re.search(r"页次:1/(\d+) 每页", pager_texts[0])
    if match is None:
        raise IndexError("page-count pattern not found in pagination text")

    # int() instead of eval(): the text comes from a remote page and must
    # never be executed as code.
    page_count = int(match.group(1))

    return [
        "http://www.jb51.net/list/list_97_" + str(page) + ".htm"
        for page in range(1, page_count + 1)
    ]
def gettitilefromurl(url):
    """Fetch a listing page and print the article links and titles it contains.

    Downloads *url*, selects every ``dl//dt/a`` anchor under the element whose
    class attribute is "artlist clearfix", and prints, in order: the list of
    ``href`` values, the list of ``title`` values, then each title on its own
    line.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        None. The results are only printed, not returned.

    Errors raised by ``urllib.request.urlopen`` propagate unchanged.
    """
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0);"}
    req = urllib.request.Request(url, headers=headers)
    # Read inside a context manager so the connection is always released.
    with urllib.request.urlopen(req) as response:
        data = response.read()

    tree = lxml.etree.HTML(data)

    hrefs = tree.xpath("//*[@class=\"artlist clearfix\"]/dl//dt/a/@href")
    print(hrefs)

    titles = tree.xpath("//*[@class=\"artlist clearfix\"]/dl//dt/a/@title")
    print(titles)

    for title in titles:
        print(title)
# Script driver: scrape page 4 of the list and print what it contains.
# gettitilefromurl() prints its findings itself and has no return value,
# so the outer print() emits a final "None" line.
print(
    gettitilefromurl(url="http://www.jb51.net/list/list_97_4.htm")
)