# CoCalc -- cfake.py
#
# GitHub Repository: mikf/gallery-dl
# Path: blob/master/gallery_dl/extractor/cfake.py
# 8810 views
1
# -*- coding: utf-8 -*-
2

3
# This program is free software; you can redistribute it and/or modify
4
# it under the terms of the GNU General Public License version 2 as
5
# published by the Free Software Foundation.
6

7
"""Extractors for https://cfake.com/"""
8

9
from .common import Extractor, Message
10
from .. import text
11

12
# URL prefix shared by every extractor pattern in this module:
# optional scheme, optional "www." subdomain, then the cfake.com host.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?cfake\.com"
13

14

15
class CfakeExtractor(Extractor):
    """Base class for cfake extractors

    Subclasses provide a 'pattern' whose capture groups are unpacked by
    items() in this order: gallery type, name, type ID, sub ID
    (may be empty), and page number (may be absent).
    """
    category = "cfake"
    root = "https://cfake.com"
    directory_fmt = ("{category}", "{type}", "{type_name} ({type_id})")
    filename_fmt = "{category}_{type_name}_{id}.{extension}"
    archive_fmt = "{id}"

    def items(self):
        """Yield one Directory message, then a Url message per image

        Starts at the page number captured from the URL (1 if none)
        and keeps requesting follow-up pages until a page yields no
        images or _check_pagination() reports no next page.
        """
        kind, type_name, type_id, sub_id, pnum = self.groups

        # Singularize plural types, e.g. "categories" -> "category"
        if kind.endswith("ies"):
            kind = kind[:-3] + "y"

        # Gallery-level metadata is stored on the extractor's kwdict
        kwdict = self.kwdict
        kwdict["type"] = kind
        kwdict["type_id"] = text.parse_int(type_id)
        kwdict["type_name"] = text.unquote(type_name).replace("_", " ")
        kwdict["sub_id"] = text.parse_int(sub_id)
        kwdict["page"] = pnum = text.parse_int(pnum, 1)
        yield Message.Directory, "", {}

        # NOTE(review): the request path uses the singularized type
        # ("category"), while the URL pattern matched "categories" --
        # presumably the site serves both forms; confirm.
        base = f"{self.root}/images/{kind}/{type_name}/{type_id}"
        if sub_id:
            base = f"{base}/{sub_id}"

        while True:
            # The first page has no "/pN" suffix
            page_url = base if pnum < 2 else f"{base}/p{pnum}"
            page = self.request(page_url).text

            # Extract and yield images; 'num' stays 0 on an empty page
            num = 0
            for num, image in enumerate(self._extract_images(page), 1):
                # Running index across pages
                # (assumes 50 images per full page -- TODO confirm)
                image["num"] = num + (pnum - 1) * 50
                url = image["url"]
                yield Message.Url, url, text.nameext_from_url(url, image)

            # Check for next page
            if not num or not (pnum := self._check_pagination(page)):
                return
            kwdict["page"] = pnum

    def _extract_images(self, page):
        """Extract image URLs and metadata from a gallery page

        Yields one dict per image with the keys
        'url', 'id', 'name', 'date', and 'rating'.
        Entries without a 'show=' parameter are skipped.
        """
        for item in text.extract_iter(
                page, '<a href="javascript:showimage(', '</div></div>'):

            # Extract image path from showimage call
            # Format: 'big.php?show=2025/filename.jpg&id_picture=...
            show_param = text.extr(item, "show=", "&")
            if not show_param:
                continue

            # Extract metadata
            picture_id = text.extr(item, "id_picture=", "&")
            name_param = text.extr(item, "p_name=", "'")

            # Extract date
            date = text.extr(item, 'id="date_vignette">', '</div>')

            # Extract rating (text between 'width:' and 'px'
            # inside the "current-rating" element; kept as a string)
            rating_text = text.extr(item, 'class="current-rating"', '</li>')
            rating = text.extr(rating_text, 'width:', 'px')

            # Convert thumbnail path to full image path
            # show_param is like "2025/filename.jpg"
            image_url = f"{self.root}/medias/photos/{show_param}"

            yield {
                "url": image_url,
                "id": text.parse_int(picture_id) if picture_id else 0,
                "name": text.unescape(name_param) if name_param else "",
                "date": date,
                "rating": rating,
            }

    def _check_pagination(self, page):
        """Check if there are more pages and return next page number

        Returns None when the current page indicator is missing or
        unparsable, or when no link to the following page is found.
        """
        # Look for current page indicator
        # Format: id="num_page_current" ><a href=".../ p1">1</a>
        current_section = text.extr(
            page, 'id="num_page_current"', '</div>')
        if not current_section:
            return None

        # Extract current page number from the link text
        current_page_str = text.extr(current_section, '">', '</a>')
        if not current_page_str:
            return None

        current_page = text.parse_int(current_page_str)
        if not current_page:
            return None

        next_page = current_page + 1

        # Check if next page link exists anywhere in the page
        # Look for href="/images/.../pN" pattern
        if f'/p{next_page}"' in page or f'/p{next_page} ' in page:
            return next_page

        return None
118

119

120
class CfakeCelebrityExtractor(CfakeExtractor):
    """Extractor for celebrity image galleries from cfake.com"""
    subcategory = "celebrity"
    # Groups: type, name, type ID, empty sub ID placeholder "()",
    # optional page number -- five in total, as unpacked by items()
    pattern = (BASE_PATTERN + r"/images/(celebrity)"
               r"/([^/?#]+)/(\d+)()(?:/p(\d+))?")
    example = "https://cfake.com/images/celebrity/NAME/123"
126

127

128
class CfakeCategoryExtractor(CfakeExtractor):
    """Extractor for category image galleries from cfake.com"""
    subcategory = "category"
    # Groups: type, name, type ID, empty sub ID placeholder "()",
    # optional page number -- five in total, as unpacked by items()
    pattern = (BASE_PATTERN + r"/images/(categories)"
               r"/([^/?#]+)/(\d+)()(?:/p(\d+))?")
    example = "https://cfake.com/images/categories/NAME/123"
134

135

136
class CfakeCreatedExtractor(CfakeExtractor):
    """Extractor for 'created' image galleries from cfake.com"""
    subcategory = "created"
    # Groups: type, name, type ID, sub ID, optional page number --
    # five in total, as unpacked by items()
    pattern = (BASE_PATTERN + r"/images/(created)"
               r"/([^/?#]+)/(\d+)/(\d+)(?:/p(\d+))?")
    example = "https://cfake.com/images/created/NAME/12345/123"
142

143

144
class CfakeCountryExtractor(CfakeExtractor):
    """Extractor for country image galleries from cfake.com"""
    subcategory = "country"
    # Groups: type, name, type ID, sub ID, optional page number --
    # five in total, as unpacked by items()
    pattern = (BASE_PATTERN + r"/images/(country)"
               r"/([^/?#]+)/(\d+)/(\d+)(?:/p(\d+))?")
    example = "https://cfake.com/images/country/NAME/12345/123"
150

151
# Product
#
# Resources
#
# Company