-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcrawler.py
More file actions
executable file
·72 lines (63 loc) · 2.33 KB
/
crawler.py
File metadata and controls
executable file
·72 lines (63 loc) · 2.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env python3
# Foundations of Python Network Programming, Third Edition
# https://github.com/chenuin/ND_hw3/blob/master/crawler.py
# Extract the email addresses from all the web pages reachable from that web site
import argparse, requests
import urllib.request,re
from urllib.parse import urljoin, urlsplit
from lxml import etree
def GET(url):
    """Fetch `url` and yield (GET, absolute_link) for every <a href> on the page.

    Yields nothing if the response is not HTML or the document fails to parse,
    so callers can always iterate the result safely.
    """
    response = requests.get(url)
    # Only crawl HTML pages; skip images, PDFs, and other content types.
    if response.headers.get('Content-Type', '').split(';')[0] != 'text/html':
        return
    try:
        html = etree.HTML(response.text)
    except Exception as e:
        print(' {}: {}'.format(e.__class__.__name__, e))
        return
    # Resolve relative hrefs against the page URL so the crawler can follow them.
    for link in html.findall('.//a[@href]'):
        yield GET, urljoin(url, link.attrib['href'])
def scrape(start, url_filter, numEXECUTE):
    """Crawl pages reachable from `start` and print email addresses found.

    Args:
        start: tuple of (fetch function, starting URL); the function is called
            as function(url, *extra) and must yield further (function, url, ...)
            call tuples.
        url_filter: predicate deciding whether a discovered URL is crawled.
        numEXECUTE: maximum number of additional URLs to enqueue before stopping.
    """
    further_work = {start}
    already_seen = {start}
    queued = 0  # count of new URLs added to the work queue
    # Compile the email pattern once (raw bytes literal: \. is a regex escape,
    # not a string escape) instead of re-scanning the pattern for every page.
    email_re = re.compile(rb"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}")
    while further_work:
        call_tuple = further_work.pop()
        function, url, *etc = call_tuple
        print(function.__name__, url, *etc)
        try:
            # Fetch the raw page bytes; `with` guarantees the handle is closed.
            with urllib.request.urlopen(url) as f:
                page = f.read()
            print(*email_re.findall(page), sep='\n')
        except Exception as e:
            # Best-effort: report the failure and keep crawling other URLs.
            print(' {}: {}'.format(e.__class__.__name__, e))
        for call_tuple in function(url, *etc):
            if call_tuple in already_seen:
                continue
            already_seen.add(call_tuple)
            function, url, *etc = call_tuple
            if not url_filter(url):
                continue
            further_work.add(call_tuple)
            queued += 1
            if queued > numEXECUTE:
                # Stop crawling rather than killing the interpreter with exit().
                return
def main(GET):
    """Parse command-line arguments and start the crawl at the given URL.

    Args:
        GET: the page-fetching generator function used to discover links.
    """
    parser = argparse.ArgumentParser(description='Scrape a simple site.')
    parser.add_argument('url', help='the URL at which to begin')
    parser.add_argument("-n", "--number", type=int,
                        help="the number of reachable website", default=15)
    # Parse the command line once and reuse the namespace (the original
    # called parse_args() twice, re-parsing sys.argv for each option).
    args = parser.parse_args()
    start_url = args.url
    # Restrict the crawl to the same host as the starting URL.
    starting_netloc = urlsplit(start_url).netloc
    url_filter = (lambda url: urlsplit(url).netloc == starting_netloc)
    scrape((GET, start_url), url_filter, args.number)
if __name__ == '__main__':
    main(GET)