'''
爬取页面邮箱2.0
爬取当前页面和子页面的邮箱(深度爬取)
'''
import csv
import re
import threading
import time
from w4.day2.SpiderUtil import *
# URLs that have already been crawled. getEmailsInfinite adds each page here
# after processing it, and both crawler functions consult it so that a page
# is not processed a second time.
tempset = set()
def getPageEmails(url):
    """Download ``url`` and append every e-mail address found on it to the CSV.

    Nothing happens when ``url`` is already recorded in the module-level
    ``tempset``.  Otherwise the page is fetched with ``getHtml``, scanned with
    ``PATTERN_EMAIL`` and each match is echoed to stdout and written to the
    module-level ``file`` as one row: address, current ``time.ctime()``
    timestamp, and the status text "未发送" (roughly "not sent yet").

    :param url: address of the page to scan.
    :return: None
    """
    if url not in tempset:
        print("\n-----",url,"-----")
        page = getHtml(url)
        # re.A restricts \w, \d, \s etc. in the pattern to ASCII characters.
        addresses = re.findall(PATTERN_EMAIL, page, flags=re.A)
        out = csv.writer(file)
        for address in addresses:
            print(address)
            row = [address, time.ctime(), "未发送"]
            out.writerow(row)
def _crawlPage(url):
    """Harvest e-mails from ``url``, mark it visited, return an iterator over its links."""
    html = getHtml(url)
    getPageEmails(url)
    tempset.add(url)
    return iter(re.findall(PATTERN_URL, html))


def getEmailsInfinite(url):
    """Crawl ``url`` and, depth-first, every not-yet-visited page it links to.

    Each page is fetched with ``getHtml``, its e-mail addresses are recorded
    by ``getPageEmails``, the page is added to ``tempset`` and its links
    (matches of ``PATTERN_URL``) are followed in the order they appear.

    The traversal order and output match a plain recursive implementation,
    but an explicit stack is used instead of recursion, so a long chain of
    links cannot exhaust the interpreter's recursion limit and abort the
    crawl with ``RecursionError``.

    :param url: address of the page at which the crawl starts.
    :return: None
    """
    # One link iterator per page on the current crawl path; the top of the
    # stack is the page whose links are being followed right now.
    trail = [_crawlPage(url)]
    while trail:
        for link in trail[-1]:
            # Checked at the moment the link is reached, because pages
            # visited deeper in the crawl may already have covered it.
            if link not in tempset:
                print("recursion")
                trail.append(_crawlPage(link))
                break  # descend into the new page before continuing here
        else:
            # All links of the current page are exhausted: go back up.
            trail.pop()
# CSV file that getPageEmails appends harvested addresses to.  It is opened
# at module level because the crawler functions read this global; "a+" keeps
# rows from earlier runs and newline="" is what the csv module expects.
file = open(r"C:\Users\idea\Desktop\岛民邮箱.csv", "a+", newline="")
if __name__ == "__main__":
    try:
        getEmailsInfinite("http://www.baidu.com/s?wd=%E5%B2%9B%E5%9B%BD%20%E9%82%AE%E7%AE%B1")
    finally:
        # Close (and thereby flush) the CSV even if the crawl raises, so
        # rows that were already collected are not lost.
        file.close()
    print("main over")