import urllib.request
import re
import threading
import xlwt
import os
from os.path import join
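
# Crawl the domains listed in each file under school_list/, fetch every page
# title in its own thread, and export one .xls workbook per school.
# The shared lists `ur` (domains) and `ti` (titles) are bound in the
# __main__ block and appended to by the worker threads.
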
def crawl(html):
    """Return the first <title> found in an HTML document, or None."""
    pattern = re.compile(r'<title>(.*?)</title>', re.I)
    match = pattern.findall(html)
    if match:
        return match[0]


def get_title(url):
    """Fetch http://<url> and record the domain and page title in ur/ti."""
    try:
        r = urllib.request.urlopen('http://' + url, timeout=5)
        html = r.read()
        # Try UTF-8 first, then fall back to GBK for legacy Chinese pages.
        try:
            title = crawl(html.decode('utf-8'))
        except UnicodeDecodeError:
            title = crawl(html.decode('gbk'))
        if title:
            ur.append(url)
            ti.append(title)
            return title
    except Exception:
        # Unreachable hosts and timeouts are silently skipped.
        pass


def p_excel(school_name):
    """Write the collected domains and titles to <school_name>.xls."""
    f = xlwt.Workbook(encoding='utf-8')
    sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    sheet01.write(0, 0, '二级域名')  # column header: subdomain
    sheet01.write(0, 1, '网站标题')  # column header: site title
    for i in range(len(ur)):
        sheet01.write(i + 1, 0, ur[i])
        sheet01.write(i + 1, 1, ti[i])
    f.save(school_name + '.xls')


def batch_read_file(path):
    a = os.listdir(path)
    print(a)
    return a


if __name__ == '__main__':
    path = 'school_list/'
    school_name_list = batch_read_file(path)
    for school_name in school_name_list:
        ur = []
        ti = []
        threads = []
        # Spawn one thread per domain in this school's list.
        with open(join(path, school_name)) as file:
            for line in file:
                thread = threading.Thread(target=get_title, args=(line.strip(),))
                thread.start()
                threads.append(thread)
        # Wait for all fetches to finish before writing the workbook.
        for t in threads:
            t.join()
        p_excel(school_name)
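
# Note: each file under school_list/ is expected to list one domain per line;
# a workbook named after the list file (with '.xls' appended) is written to
# the working directory for each school.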