import re
from functools import lru_cache

from bs4 import BeautifulSoup

from lib.core.settings import (
    CRAWL_ATTRIBUTES, CRAWL_TAGS,
    MEDIA_EXTENSIONS, ROBOTS_TXT_REGEX,
    URI_REGEX,
)
from lib.parse.url import clean_path, parse_path
from lib.utils.common import merge_path


def _filter(paths):
    # Skip static media files (MEDIA_EXTENSIONS is expected to be a tuple of
    # suffixes, which str.endswith() accepts) and normalize what is left
    return {
        clean_path(path, keep_queries=True)
        for path in paths
        if not path.endswith(MEDIA_EXTENSIONS)
    }


class Crawler:
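    """
    Extract new paths to scan from a response.

    ``crawl()`` dispatches on the response type: HTML bodies are parsed with
    BeautifulSoup, ``robots.txt`` with a dedicated regex, and anything else
    with a generic in-scope URL regex. Results are returned relative to the
    crawl scope (scheme + host) and filtered through ``_filter()``.
    """
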
    @classmethod
    def crawl(cls, response):
        # The crawl scope is the scheme plus host, e.g. "https://example.com/"
        scope = "/".join(response.url.split("/")[:3]) + "/"

        if "text/html" in response.headers.get("content-type", ""):
            return cls.html_crawl(response.url, scope, response.content)
        elif response.path == "robots.txt":
            return cls.robots_crawl(response.url, scope, response.content)
        else:
            return cls.text_crawl(response.url, scope, response.content)
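
    # Fallback for non-HTML responses: pull absolute, in-scope URLs out of the
    # raw body with a regex and strip the scope prefix.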
    @staticmethod
    @lru_cache(maxsize=None)
    def text_crawl(url, scope, content):
        results = []
        # Match the literal scope followed by URL path characters, then keep
        # only the part after the scope (the relative path)
        regex = re.escape(scope) + "[a-zA-Z0-9-._~!$&*+,;=:@?%]+"
        for match in re.findall(regex, content):
            results.append(match[len(scope):])

        return _filter(results)
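
    # Parse the HTML with BeautifulSoup and collect link targets from the
    # crawlable tags and attributes (CRAWL_TAGS / CRAWL_ATTRIBUTES).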
    @staticmethod
    @lru_cache(maxsize=None)
    def html_crawl(url, scope, content):
        results = []
        soup = BeautifulSoup(content, "html.parser")
        for tag in CRAWL_TAGS:
            for found in soup.find_all(tag):
                for attr in CRAWL_ATTRIBUTES:
                    value = found.get(attr)
                    if not value:
                        continue

                    if value.startswith("/"):
                        # Root-relative link: already inside the scope
                        results.append(value[1:])
                    elif value.startswith(scope):
                        # Absolute link inside the scope
                        results.append(value[len(scope):])
                    elif not re.search(URI_REGEX, value):
                        # Relative link: resolve it against the current URL
                        new_url = merge_path(url, value)
                        results.append(parse_path(new_url))

        return _filter(results)
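
    # Extract the paths listed in robots.txt directives via ROBOTS_TXT_REGEX.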
    @staticmethod
    @lru_cache(maxsize=None)
    def robots_crawl(url, scope, content):
        return _filter(re.findall(ROBOTS_TXT_REGEX, content))
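

# Minimal usage sketch: ``crawl()`` only relies on a response object exposing
# ``url``, ``path``, ``headers`` and ``content`` attributes, and returns a set
# of in-scope relative paths, e.g.:
#
#     new_paths = Crawler.crawl(response)
#     # feed the discovered paths back into the scan queue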