Mining University Subdomains

Mining university subdomains makes use of a web crawler, shows how to work with Excel files from Python (via xlwt), and involves multithreading. A minimal sketch of the core crawl step follows; the full source code comes after it.
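Before the full script, here is a minimal sketch of the core step: fetching one host over HTTP and pulling the page title out with a regex. The host name below is only a placeholder; substitute any real subdomain.

import re
import urllib.request

# Minimal sketch of the crawl step (the host below is a placeholder)
host = 'www.example.edu.cn'
html = urllib.request.urlopen('http://' + host, timeout=5).read()
titles = re.findall(r'<title>(.*?)</title>', html.decode('utf-8', errors='replace'), re.I)
print(titles[0] if titles else 'no title found')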

Source code:

# -*- coding: utf-8 -*-
import urllib.request
import re
import threading
import xlwt
import os
from os.path import join

# Match the page title with a regex and return the first <title> found
def carwl(html):
    pattern = re.compile(r'<title>(.*?)</title>', re.I)
    match = pattern.findall(html)
    if match:
        return match[0]

# Fetch a host over HTTP and collect its title into the shared result lists
def get_title(url):
    try:
        r = urllib.request.urlopen('http://' + url, timeout=5)
        html = r.read()
        # Decode the page: try UTF-8 first, fall back to GBK
        try:
            title = carwl(html.decode('utf-8'))
        except UnicodeDecodeError:
            title = carwl(html.decode('gbk'))
        if title:
            ur.append(url)    # list.append is thread-safe under CPython's GIL
            ti.append(title)
        return title
    except Exception:
        pass  # skip hosts that time out or fail to resolve

# Single-file test:
# file = open('hndx.txt', 'r')
# for line in file:
#     print(get_title(line))

# Write the collected results to an Excel file with xlwt
def p_excel(school_name):
    f = xlwt.Workbook(encoding='utf-8')
    sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    sheet01.write(0, 0, '二级域名')   # header: subdomain
    sheet01.write(0, 1, '网站标题')   # header: site title
    for i in range(len(ur)):
        sheet01.write(i + 1, 0, ur[i])
        sheet01.write(i + 1, 1, ti[i])
    f.save(school_name + '.xls')

# List the file names inside a directory
def batch_read_file(path):
    a = os.listdir(path)
    print(a)
    return a

if __name__ == '__main__':
    path = 'school_list/'
    school_name_list = batch_read_file(path)
    for school_name in school_name_list:
        ur = []
        ti = []
        threads = []
        file = open(join(path, school_name))
        for line in file:
            # One thread per subdomain; strip the trailing newline first
            thread = threading.Thread(target=get_title, args=(line.strip(),))
            thread.start()
            threads.append(thread)
        for t in threads:
            t.join()
        p_excel(school_name)
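
The script reads its targets from a school_list/ directory in which every file is named after a school and holds one subdomain per line; the results for each school are saved to <file name>.xls. Below is a small sketch of how such an input file could be prepared; the file name hndx.txt (taken from the commented-out test above) and the domains are placeholders only.

import os

# Prepare a sample input file in the format the script expects (names are placeholders)
os.makedirs('school_list', exist_ok=True)
with open('school_list/hndx.txt', 'w') as f:
    f.write('www.example.edu.cn\n')   # one subdomain per line
    f.write('lib.example.edu.cn\n')
# Running the script afterwards produces hndx.txt.xls in the working directory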