Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
maurosoria
GitHub Repository: maurosoria/dirsearch
Path: blob/master/lib/utils/crawl.py
896 views
1
# -*- coding: utf-8 -*-
2
# This program is free software; you can redistribute it and/or modify
3
# it under the terms of the GNU General Public License as published by
4
# the Free Software Foundation; either version 2 of the License, or
5
# (at your option) any later version.
6
#
7
# This program is distributed in the hope that it will be useful,
8
# but WITHOUT ANY WARRANTY; without even the implied warranty of
9
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
# GNU General Public License for more details.
11
#
12
# You should have received a copy of the GNU General Public License
13
# along with this program; if not, write to the Free Software
14
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
15
# MA 02110-1301, USA.
16
#
17
# Author: Mauro Soria
18
19
import re
20
21
from bs4 import BeautifulSoup
22
from functools import lru_cache
23
24
from lib.core.settings import (
25
CRAWL_ATTRIBUTES, CRAWL_TAGS,
26
MEDIA_EXTENSIONS, ROBOTS_TXT_REGEX,
27
URI_REGEX,
28
)
29
from lib.parse.url import clean_path, parse_path
30
from lib.utils.common import merge_path
31
32
33
def _filter(paths):
    """Normalize a collection of raw paths into a set of cleaned results.

    Paths pointing at media files (by extension) are discarded; the rest
    are cleaned with query strings preserved. Returning a set removes
    duplicates that different crawl sources may produce.
    """
    filtered = set()
    for raw in paths:
        # MEDIA_EXTENSIONS is a tuple, so endswith checks all of them at once
        if raw.endswith(MEDIA_EXTENSIONS):
            continue
        filtered.add(clean_path(raw, keep_queries=True))

    return filtered
35
36
37
class Crawler:
    """Extract new in-scope paths from an HTTP response.

    Dispatches on the response type (HTML page, robots.txt, or plain text)
    and returns a set of paths relative to the response's scheme+host scope.
    """

    @classmethod
    def crawl(cls, response):
        # Scope = scheme + "://" + host + "/", taken from the response URL
        # (splitting on "/" keeps "http:", "", "host" as the first 3 parts).
        scope = "/".join(response.url.split("/")[:3]) + "/"

        if "text/html" in response.headers.get("content-type", ""):
            return cls.html_crawl(response.url, scope, response.content)
        elif response.path == "robots.txt":
            return cls.robots_crawl(response.url, scope, response.content)
        else:
            return cls.text_crawl(response.url, scope, response.content)

    @staticmethod
    @lru_cache(maxsize=None)
    def text_crawl(url, scope, content):
        """Find absolute in-scope URLs inside arbitrary text content.

        Matches occurrences of the scope prefix followed by URL-safe
        characters, and returns the path portions (scope stripped).
        NOTE(review): the cache is unbounded; for very long crawls this
        keeps every response body's results in memory — confirm acceptable.
        """
        results = []
        # Escape the scope so host/scheme characters aren't treated as regex
        regex = re.escape(scope) + "[a-zA-Z0-9-._~!$&*+,;=:@?%]+"

        for match in re.findall(regex, content):
            results.append(match[len(scope):])

        return _filter(results)

    @staticmethod
    @lru_cache(maxsize=None)
    def html_crawl(url, scope, content):
        """Extract in-scope paths from link-bearing attributes of an HTML page.

        Walks CRAWL_TAGS / CRAWL_ATTRIBUTES (e.g. href/src-style attributes)
        and classifies each value as root-relative, absolute in-scope,
        protocol-relative, or document-relative.
        """
        results = []
        soup = BeautifulSoup(content, 'html.parser')
        # Protocol-relative form of the scope ("//host/"), used to recognize
        # scheme-less absolute URLs that still point at the target host.
        netloc_scope = "//" + scope.split("/")[2] + "/"

        for tag in CRAWL_TAGS:
            for found in soup.find_all(tag):
                for attr in CRAWL_ATTRIBUTES:
                    value = found.get(attr)

                    if not value:
                        continue

                    if value.startswith("//"):
                        # Protocol-relative URL: previously fell into the
                        # "/"-prefix branch and was misread as a local path
                        # (e.g. "//cdn.example.com/x.js" -> "/cdn.example.com/x.js").
                        # Keep it only when it targets the same host.
                        if value.startswith(netloc_scope):
                            results.append(value[len(netloc_scope):])
                    elif value.startswith("/"):
                        # Root-relative path: strip the leading slash
                        results.append(value[1:])
                    elif value.startswith(scope):
                        # Absolute URL inside the scope: keep only the path
                        results.append(value[len(scope):])
                    elif not re.search(URI_REGEX, value):
                        # Document-relative reference (not some other URI
                        # scheme): resolve it against the current URL.
                        new_url = merge_path(url, value)
                        results.append(parse_path(new_url))

        return _filter(results)

    @staticmethod
    @lru_cache(maxsize=None)
    def robots_crawl(url, scope, content):
        """Extract paths from robots.txt directives via ROBOTS_TXT_REGEX."""
        return _filter(re.findall(ROBOTS_TXT_REGEX, content))
88
89