Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
laramies
GitHub Repository: laramies/theHarvester
Path: blob/master/tests/discovery/test_baidusearch.py
609 views
1
import pytest
# MarkDecorator is public API on the pytest namespace (since pytest 6.2);
# avoid importing from the private _pytest internals, which can break
# across pytest releases.
from pytest import MarkDecorator

from theHarvester.discovery import baidusearch

# Mark every coroutine test in this module to run under pytest-asyncio.
pytestmark: MarkDecorator = pytest.mark.asyncio
7
8
9
class TestBaiduSearch:
    """Unit tests for SearchBaidu with all network I/O stubbed out."""

    async def test_process_and_parsing(self, monkeypatch):
        recorded = {}

        async def stub_fetch_all(urls, headers=None, proxy=False):
            # Record exactly what process() asked for, then hand back
            # canned response bodies containing emails and hostnames.
            recorded["urls"] = urls
            recorded["headers"] = headers
            recorded["proxy"] = proxy
            return [
                "Contact [email protected] on a.example.com \n",
                " [email protected] is here and www.example.com appears \n",
                " Visit sub.a.example.com. [email protected] \n",
            ]

        # Replace AsyncFetcher.fetch_all so no real requests are made,
        # and pin the user agent for deterministic headers.
        import theHarvester.lib.core as core_module

        monkeypatch.setattr(core_module.AsyncFetcher, "fetch_all", stub_fetch_all)
        monkeypatch.setattr(core_module.Core, "get_user_agent", staticmethod(lambda: "UA"), raising=True)

        searcher = baidusearch.SearchBaidu(word="example.com", limit=21)
        await searcher.process(proxy=True)

        # limit=21 paginates over pn=0, 10, 20.
        expected_urls = [
            f"https://www.baidu.com/s?wd=%40example.com&pn={page}&oq=example.com"
            for page in (0, 10, 20)
        ]
        assert recorded["urls"] == expected_urls
        assert recorded["proxy"] is True

        found_emails = await searcher.get_emails()
        found_hosts = await searcher.get_hostnames()

        # The canned bodies above must all have been parsed out.
        for expected_email in (
            "[email protected]",
            "[email protected]",
            "[email protected]",
        ):
            assert expected_email in found_emails

        assert {"a.example.com", "www.example.com", "sub.a.example.com"} <= set(found_hosts)

    async def test_pagination_limit_exclusive(self, monkeypatch):
        seen = {}

        async def stub_fetch_all(urls, headers=None, proxy=False):
            seen["urls"] = urls
            # One empty body per requested URL.
            return [""] * len(urls)

        import theHarvester.lib.core as core_module

        monkeypatch.setattr(core_module.AsyncFetcher, "fetch_all", stub_fetch_all)
        monkeypatch.setattr(core_module.Core, "get_user_agent", staticmethod(lambda: "UA"), raising=True)

        searcher = baidusearch.SearchBaidu(word="example.com", limit=20)
        await searcher.process()

        # For limit=20, range(0, 20, 10) yields 0 and 10 only (20 is excluded).
        assert seen["urls"] == [
            f"https://www.baidu.com/s?wd=%40example.com&pn={page}&oq=example.com"
            for page in (0, 10)
        ]