'''
Crawl historical data for individual stocks.
'''
import re
import threading
import requests
from SpiderUtil import *
# Page that is scraped for the list of stock names and codes.
stockUrl = "http://quote.eastmoney.com/stocklist.html"

# A download URL is assembled as:
#     downUrlStart + <market digit> + <six-digit stock code> + downUrlEnd
downUrlStart = 'http://quotes.money.163.com/service/chddata.html?code='

# Fixed query-string tail: the "end" value and the list of requested fields.
# NOTE(review): "end" looks like a hard-coded YYYYMMDD end date - confirm.
downUrlEnd = (
    '&end=20170830'
    '&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;'
    'TURNOVER;VOTURNOVER;VATURNOVER;TCAP;MCAP'
)
def getStockList(url):
    """Download the stock list page and return the (name, code) pairs on it.

    The response body is decoded as GBK and scanned for entries of the form
    ``<li><a ...>NAME(CODE)</a></li>`` where CODE is exactly six digits.
    Only entries whose code starts with "6", "3" or "0" are returned.

    Args:
        url: Address of the stock list page.

    Returns:
        A list of ``(name, code)`` string tuples, in the order they were
        matched on the page.
    """
    html = requests.get(url).content.decode("gbk")
    # Raw string: in a plain literal "\w", "\(" and "\d" are invalid escape
    # sequences, which Python warns about and will eventually reject.
    pattern = r"<li><a.*>(\w*)\((\d{6})\)</a></li>"
    wanted_prefixes = ("6", "3", "0")
    # Filter in a single pass instead of copying the list and calling
    # list.remove() for every rejected entry (quadratic on a long page).
    return [
        entry
        for entry in re.findall(pattern, html)
        if entry[1].startswith(wanted_prefixes)
    ]
# Shared by all download threads: at most 100 of them may be inside
# downloadFile() at the same time.
sem = threading.Semaphore(100)


def myDownloadFile(url, filepath):
    """Run ``downloadFile(url, filepath)`` while holding one slot of ``sem``.

    The call blocks until a slot is free. The slot is released again when
    ``downloadFile`` returns or raises.

    Args:
        url: Passed through unchanged to ``downloadFile``.
        filepath: Passed through unchanged to ``downloadFile``.
    """
    sem.acquire()
    try:
        # downloadFile is not defined in this file; presumably it is provided
        # by the star import from SpiderUtil - verify.
        downloadFile(url, filepath)
    finally:
        sem.release()
if __name__ == "__main__":
    # Script entry point: fetch the stock list, then start one thread per
    # stock to download its CSV. The number of downloads that actually run
    # at once is limited by the semaphore inside myDownloadFile.
    slist = getStockList(stockUrl)
    for sname, scode in slist:
        # Market digit placed in front of the code in the download URL:
        # "0" for codes starting with "6", "1" for everything else.
        market = "0" if scode.startswith("6") else "1"
        url = downUrlStart + market + scode + downUrlEnd
        # Backslashes are escaped explicitly; the unescaped "\P" and "\c"
        # are invalid escape sequences. The resulting path is unchanged.
        filepath = "D:\\PyDownload\\csv\\" + scode + "_" + sname + ".csv"
        threading.Thread(target=myDownloadFile, args=(url, filepath)).start()
    # Printed once every thread has been started; the threads are not joined,
    # so downloads may still be running at this point.
    print("main over")