Path: blob/master/sherlock_project/sherlock.py
#! /usr/bin/env python3
"""
Sherlock: Find Usernames Across Social Networks Module

This module contains the main logic to search for usernames at social
networks.
"""

import sys

try:
    from sherlock_project.__init__ import import_error_test_var  # noqa: F401
except ImportError:
    print("Did you run Sherlock with `python3 sherlock/sherlock.py ...`?")
    print("This is an outdated method. Please see https://sherlockproject.xyz/installation for up-to-date instructions.")
    sys.exit(1)

import csv
import signal
import pandas as pd
import os
import re
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from json import loads as json_loads
from time import monotonic
from typing import Optional

import requests
from requests_futures.sessions import FuturesSession

from sherlock_project.__init__ import (
    __longname__,
    __shortname__,
    __version__,
    forge_api_latest_release,
)

from sherlock_project.result import QueryStatus
from sherlock_project.result import QueryResult
from sherlock_project.notify import QueryNotify
from sherlock_project.notify import QueryNotifyPrint
from sherlock_project.sites import SitesInformation
from colorama import init
from argparse import ArgumentTypeError


class SherlockFuturesSession(FuturesSession):
    def request(self, method, url, hooks=None, *args, **kwargs):
        """Request URL.

        This extends the FuturesSession request method to calculate a response
        time metric for each request.

        It is taken (almost) directly from the requests-futures documentation:
        https://github.com/ross/requests-futures#working-in-the-background

        Keyword Arguments:
        self                   -- This object.
        method                 -- String containing method desired for request.
        url                    -- String containing URL for request.
        hooks                  -- Dictionary containing hooks to execute after
                                  request finishes.
        args                   -- Arguments.
        kwargs                 -- Keyword arguments.

        Return Value:
        Request object.
        """
        # Record the start time for the request.
        if hooks is None:
            hooks = {}
        start = monotonic()

        def response_time(resp, *args, **kwargs):
            """Response Time Hook.

            Keyword Arguments:
            resp                   -- Response object.
            args                   -- Arguments.
            kwargs                 -- Keyword arguments.

            Return Value:
            Nothing.
            """
            resp.elapsed = monotonic() - start

            return

        # Install hook to execute when response completes.
        # Make sure that the time measurement hook is first, so we will not
        # track any later hook's execution time.
        try:
            if isinstance(hooks["response"], list):
                hooks["response"].insert(0, response_time)
            elif isinstance(hooks["response"], tuple):
                # Convert tuple to list and insert time measurement hook first.
                hooks["response"] = list(hooks["response"])
                hooks["response"].insert(0, response_time)
            else:
                # Must have previously contained a single hook function,
                # so convert to list.
                hooks["response"] = [response_time, hooks["response"]]
        except KeyError:
            # No response hook was already defined, so install it ourselves.
            hooks["response"] = [response_time]

        return super(SherlockFuturesSession, self).request(
            method, url, hooks=hooks, *args, **kwargs
        )


def get_response(request_future, error_type, social_network):
    # Default for Response object if some failure occurs.
    response = None

    error_context = "General Unknown Error"
    exception_text = None
    try:
        response = request_future.result()
        if response.status_code:
            # Status code exists in response object
            error_context = None
    except requests.exceptions.HTTPError as errh:
        error_context = "HTTP Error"
        exception_text = str(errh)
    except requests.exceptions.ProxyError as errp:
        error_context = "Proxy Error"
        exception_text = str(errp)
    except requests.exceptions.ConnectionError as errc:
        error_context = "Error Connecting"
        exception_text = str(errc)
    except requests.exceptions.Timeout as errt:
        error_context = "Timeout Error"
        exception_text = str(errt)
    except requests.exceptions.RequestException as err:
        error_context = "Unknown Error"
        exception_text = str(err)

    return response, error_context, exception_text


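# Example of interpolate_string() behavior (values below are placeholders, not
# real site data): the username replaces every "{}" placeholder, recursing
# through dicts and lists.
#     interpolate_string("https://example.com/{}", "alice")
#         -> "https://example.com/alice"
#     interpolate_string({"q": "{}"}, "alice") -> {"q": "alice"}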
def interpolate_string(input_object, username):
    if isinstance(input_object, str):
        return input_object.replace("{}", username)
    elif isinstance(input_object, dict):
        return {k: interpolate_string(v, username) for k, v in input_object.items()}
    elif isinstance(input_object, list):
        return [interpolate_string(i, username) for i in input_object]
    return input_object


def check_for_parameter(username):
    """Check if the "{?}" parameter exists in the username.

    If it does, Sherlock is being asked to check multiple username variants.
    """
    return "{?}" in username


checksymbols = ["_", "-", "."]


def multiple_usernames(username):
    """Replace the "{?}" parameter with each symbol and return a list of usernames."""
    allUsernames = []
    for i in checksymbols:
        allUsernames.append(username.replace("{?}", i))
    return allUsernames


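# Illustrative expansion of the "{?}" parameter handled by the two helpers
# above (the username is a placeholder):
#     multiple_usernames("john{?}doe") -> ["john_doe", "john-doe", "john.doe"]

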
def sherlock(
    username: str,
    site_data: dict[str, dict[str, str]],
    query_notify: QueryNotify,
    dump_response: bool = False,
    proxy: Optional[str] = None,
    timeout: int = 60,
) -> dict[str, dict[str, str | QueryResult]]:
    """Run Sherlock Analysis.

    Checks for existence of username on various social media sites.

    Keyword Arguments:
    username               -- String indicating username that report
                              should be created against.
    site_data              -- Dictionary containing all of the site data.
    query_notify           -- Object with base type of QueryNotify().
                              This will be used to notify the caller about
                              query results.
    dump_response          -- Boolean indicating whether to dump the full HTTP
                              response to stdout for debugging (default False).
    proxy                  -- String indicating the proxy URL.
    timeout                -- Time in seconds to wait before timing out request.
                              Default is 60 seconds.

    Return Value:
    Dictionary containing results from report. Key of dictionary is the name
    of the social network site, and the value is another dictionary with
    the following keys:
        url_main:      URL of main site.
        url_user:      URL of user on site (if account exists).
        status:        QueryResult() object indicating results of test for
                       account existence.
        http_status:   HTTP status code of query which checked for existence on
                       site.
        response_text: Text that came back from request. May be None if
                       there was an HTTP error when checking for existence.
    """

    # Notify caller that we are starting the query.
    query_notify.start(username)

    # Underlying plain requests session, shared by all futures.
    underlying_session = requests.session()

    # Limit number of workers to 20.
    # This is probably vastly overkill.
    if len(site_data) >= 20:
        max_workers = 20
    else:
        max_workers = len(site_data)

    # Create multi-threaded session for all requests.
    session = SherlockFuturesSession(
        max_workers=max_workers, session=underlying_session
    )

    # Results from analysis of all sites
    results_total = {}

    # First create futures for all requests. This allows the requests to run
    # in parallel.
    for social_network, net_info in site_data.items():
        # Results from analysis of this specific site; record URL of main site.
        results_site = {"url_main": net_info.get("urlMain")}

        # A user agent is needed because some sites don't return the correct
        # information since they think that we are bots (which we actually are...).
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:129.0) Gecko/20100101 Firefox/129.0",
        }

        if "headers" in net_info:
            # Override/append any extra headers required by a given site.
            headers.update(net_info["headers"])

        # URL of user on site (if it exists)
        url = interpolate_string(net_info["url"], username.replace(" ", "%20"))

        # Don't make request if username is invalid for the site.
        regex_check = net_info.get("regexCheck")
        if regex_check and re.search(regex_check, username) is None:
            # No need to do the check at the site: this username is not allowed.
            results_site["status"] = QueryResult(
                username, social_network, url, QueryStatus.ILLEGAL
            )
            results_site["url_user"] = ""
            results_site["http_status"] = ""
            results_site["response_text"] = ""
            query_notify.update(results_site["status"])
        else:
            # URL of user on site (if it exists)
            results_site["url_user"] = url
            url_probe = net_info.get("urlProbe")
            request_method = net_info.get("request_method")
            request_payload = net_info.get("request_payload")
            request = None

            if request_method is not None:
                if request_method == "GET":
                    request = session.get
                elif request_method == "HEAD":
                    request = session.head
                elif request_method == "POST":
                    request = session.post
                elif request_method == "PUT":
                    request = session.put
                else:
                    raise RuntimeError(f"Unsupported request_method for {url}")

            if request_payload is not None:
                request_payload = interpolate_string(request_payload, username)

            if url_probe is None:
                # The probe URL is the normal one seen by people out on the web.
                url_probe = url
            else:
                # There is a special URL for probing existence separate
                # from where the user profile normally can be found.
                url_probe = interpolate_string(url_probe, username)

            if request is None:
                if net_info["errorType"] == "status_code":
                    # In most cases when we are detecting by status code,
                    # it is not necessary to get the entire body: we can
                    # detect fine with just the HEAD response.
                    request = session.head
                else:
                    # Either this detect method needs the content associated
                    # with the GET response, or this specific website will
                    # not respond properly unless we request the whole page.
                    request = session.get

            if net_info["errorType"] == "response_url":
                # Site forwards request to a different URL if username not
                # found. Disallow the redirect so we can capture the
                # HTTP status from the original URL request.
                allow_redirects = False
            else:
                # Allow whatever redirect that the site wants to do.
                # The final result of the request will be what is available.
                allow_redirects = True

            # This future starts the request in a new thread without blocking
            # the main thread.
            if proxy is not None:
                proxies = {"http": proxy, "https": proxy}
                future = request(
                    url=url_probe,
                    headers=headers,
                    proxies=proxies,
                    allow_redirects=allow_redirects,
                    timeout=timeout,
                    json=request_payload,
                )
            else:
                future = request(
                    url=url_probe,
                    headers=headers,
                    allow_redirects=allow_redirects,
                    timeout=timeout,
                    json=request_payload,
                )

            # Store future in data for access later.
            net_info["request_future"] = future

        # Add this site's results into final dictionary with all the other results.
        results_total[social_network] = results_site

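    # At this point every request has been dispatched to the session's thread
    # pool, so the probes run concurrently; the loop below then blocks on one
    # future at a time, in manifest order, to analyze the responses.
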
    # Second pass: analyze the responses as the futures complete.
    for social_network, net_info in site_data.items():
        # Retrieve results again
        results_site = results_total.get(social_network)

        # Retrieve other site information again
        url = results_site.get("url_user")
        status = results_site.get("status")
        if status is not None:
            # We have already determined the user doesn't exist here
            continue

        # Get the expected error type
        error_type = net_info["errorType"]
        if isinstance(error_type, str):
            error_type: list[str] = [error_type]

        # Retrieve future and ensure it has finished
        future = net_info["request_future"]
        r, error_text, exception_text = get_response(
            request_future=future, error_type=error_type, social_network=social_network
        )

        # Get response time for response of our request.
        try:
            response_time = r.elapsed
        except AttributeError:
            response_time = None

        # Attempt to get request information
        try:
            http_status = r.status_code
        except Exception:
            http_status = "?"
        try:
            response_text = r.text.encode(r.encoding or "UTF-8")
        except Exception:
            response_text = ""

        query_status = QueryStatus.UNKNOWN
        error_context = None

        # As WAFs advance and evolve, they will occasionally block Sherlock and
        # lead to false positives and negatives. Fingerprints should be added
        # here to filter results that fail to bypass WAFs. Fingerprints should
        # be highly targeted. Comment at the end of each fingerprint to
        # indicate target and date fingerprinted.
        WAFHitMsgs = [
            r'.loading-spinner{visibility:hidden}body.no-js .challenge-running{display:none}body.dark{background-color:#222;color:#d9d9d9}body.dark a{color:#fff}body.dark a:hover{color:#ee730a;text-decoration:underline}body.dark .lds-ring div{border-color:#999 transparent transparent}body.dark .font-red{color:#b20f03}body.dark',  # 2024-05-13 Cloudflare
            r'<span id="challenge-error-text">',  # 2024-11-11 Cloudflare error page
            r'AwsWafIntegration.forceRefreshToken',  # 2024-11-11 Cloudfront (AWS)
            r'{return l.onPageView}}),Object.defineProperty(r,"perimeterxIdentifiers",{enumerable:',  # 2024-04-09 PerimeterX / Human Security
        ]

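        # Detection precedence below: a transport-level error wins, then a WAF
        # fingerprint match, then the manifest's errorType checks ("message",
        # "status_code", "response_url"), each of which may downgrade a
        # tentative CLAIMED to AVAILABLE.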
        if error_text is not None:
            error_context = error_text

        elif any(hitMsg in r.text for hitMsg in WAFHitMsgs):
            query_status = QueryStatus.WAF

        else:
            if any(errtype not in ["message", "status_code", "response_url"] for errtype in error_type):
                error_context = f"Unknown error type '{error_type}' for {social_network}"
                query_status = QueryStatus.UNKNOWN
            else:
                if "message" in error_type:
                    # error_flag True denotes no error found in the HTML
                    # error_flag False denotes error found in the HTML
                    error_flag = True
                    # errorMsg from the manifest can be a single string or a
                    # list of strings; handle both cases.
                    errors = net_info.get("errorMsg")
                    if isinstance(errors, str):
                        # If the error message is present in the HTML, set the
                        # flag to False.
                        if errors in r.text:
                            error_flag = False
                    else:
                        # It's a list: check each error message.
                        for error in errors:
                            if error in r.text:
                                error_flag = False
                                break
                    if error_flag:
                        query_status = QueryStatus.CLAIMED
                    else:
                        query_status = QueryStatus.AVAILABLE

                if "status_code" in error_type and query_status is not QueryStatus.AVAILABLE:
                    error_codes = net_info.get("errorCode")
                    query_status = QueryStatus.CLAIMED

                    # Type consistency, allowing for both single values and
                    # lists in the manifest.
                    if isinstance(error_codes, int):
                        error_codes = [error_codes]

                    if error_codes is not None and r.status_code in error_codes:
                        query_status = QueryStatus.AVAILABLE
                    elif r.status_code >= 300 or r.status_code < 200:
                        query_status = QueryStatus.AVAILABLE

                if "response_url" in error_type and query_status is not QueryStatus.AVAILABLE:
                    # For this detection method, we have turned off the redirect.
                    # So, there is no need to check the response URL: it will always
                    # match the request. Instead, we will ensure that the response
                    # code indicates that the request was successful (i.e. no 404, or
                    # forward to some odd redirect).
                    if 200 <= r.status_code < 300:
                        query_status = QueryStatus.CLAIMED
                    else:
                        query_status = QueryStatus.AVAILABLE

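        # Status semantics as used in this module: CLAIMED = username appears
        # taken, AVAILABLE = appears free, UNKNOWN = the check errored out,
        # WAF = a firewall blocked the probe, ILLEGAL = username fails the
        # site's regexCheck.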
        if dump_response:
            print("+++++++++++++++++++++")
            print(f"TARGET NAME   : {social_network}")
            print(f"USERNAME      : {username}")
            print(f"TARGET URL    : {url}")
            print(f"TEST METHOD   : {error_type}")
            try:
                print(f"STATUS CODES  : {net_info['errorCode']}")
            except KeyError:
                pass
            print("Results...")
            try:
                print(f"RESPONSE CODE : {r.status_code}")
            except Exception:
                pass
            try:
                print(f"ERROR TEXT    : {net_info['errorMsg']}")
            except KeyError:
                pass
            print(">>>>> BEGIN RESPONSE TEXT")
            try:
                print(r.text)
            except Exception:
                pass
            print("<<<<< END RESPONSE TEXT")
            print("VERDICT : " + str(query_status))
            print("+++++++++++++++++++++")

        # Notify caller about results of query.
        result: QueryResult = QueryResult(
            username=username,
            site_name=social_network,
            site_url_user=url,
            status=query_status,
            query_time=response_time,
            context=error_context,
        )
        query_notify.update(result)

        # Save status of request
        results_site["status"] = result

        # Save results from request
        results_site["http_status"] = http_status
        results_site["response_text"] = response_text

        # Add this site's results into final dictionary with all of the other results.
        results_total[social_network] = results_site

    return results_total


def timeout_check(value):
    """Check Timeout Argument.

    Checks timeout for validity.

    Keyword Arguments:
    value                  -- Time in seconds to wait before timing out request.

    Return Value:
    Floating point number representing the time (in seconds) that should be
    used for the timeout.

    NOTE: Will raise an exception if the timeout is invalid.
    """

    float_value = float(value)

    if float_value <= 0:
        raise ArgumentTypeError(
            f"Invalid timeout value: {value}. Timeout must be a positive number."
        )

    return float_value


def handler(signal_received, frame):
    """Exit gracefully without throwing errors.

    Source: https://www.devdungeon.com/content/python-catch-sigint-ctrl-c
    """
    sys.exit(0)


def main():
    parser = ArgumentParser(
        formatter_class=RawDescriptionHelpFormatter,
        description=f"{__longname__} (Version {__version__})",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"{__shortname__} v{__version__}",
        help="Display version information and dependencies.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        "-d",
        "--debug",
        action="store_true",
        dest="verbose",
        default=False,
        help="Display extra debugging information and metrics.",
    )
    parser.add_argument(
        "--folderoutput",
        "-fo",
        dest="folderoutput",
        help="If using multiple usernames, the output of the results will be saved to this folder.",
    )
    parser.add_argument(
        "--output",
        "-o",
        dest="output",
        help="If using a single username, the output of the result will be saved to this file.",
    )
    parser.add_argument(
        "--csv",
        action="store_true",
        dest="csv",
        default=False,
        help="Create Comma-Separated Values (CSV) File.",
    )
    parser.add_argument(
        "--xlsx",
        action="store_true",
        dest="xlsx",
        default=False,
        help="Create the standard file for the modern Microsoft Excel spreadsheet (xlsx).",
    )
    parser.add_argument(
        "--site",
        action="append",
        metavar="SITE_NAME",
        dest="site_list",
        default=[],
        help="Limit analysis to just the listed sites. Add multiple options to specify more than one site.",
    )
    parser.add_argument(
        "--proxy",
        "-p",
        metavar="PROXY_URL",
        action="store",
        dest="proxy",
        default=None,
        help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080",
    )
    parser.add_argument(
        "--dump-response",
        action="store_true",
        dest="dump_response",
        default=False,
        help="Dump the HTTP response to stdout for targeted debugging.",
    )
    parser.add_argument(
        "--json",
        "-j",
        metavar="JSON_FILE",
        dest="json_file",
        default=None,
        help="Load data from a JSON file or an online, valid, JSON file. Upstream PR numbers also accepted.",
    )
    parser.add_argument(
        "--timeout",
        action="store",
        metavar="TIMEOUT",
        dest="timeout",
        type=timeout_check,
        default=60,
        help="Time (in seconds) to wait for response to requests (Default: 60)",
    )
    parser.add_argument(
        "--print-all",
        action="store_true",
        dest="print_all",
        default=False,
        help="Output sites where the username was not found.",
    )
    parser.add_argument(
        "--print-found",
        action="store_true",
        dest="print_found",
        default=True,
        help="Output sites where the username was found (also if exported as file).",
    )
    parser.add_argument(
        "--no-color",
        action="store_true",
        dest="no_color",
        default=False,
        help="Don't color terminal output.",
    )
    parser.add_argument(
        "username",
        nargs="+",
        metavar="USERNAMES",
        action="store",
        help="One or more usernames to check with social networks. Check similar usernames using {?} (replaced with '_', '-', '.').",
    )
    parser.add_argument(
        "--browse",
        "-b",
        action="store_true",
        dest="browse",
        default=False,
        help="Browse to all results on default browser.",
    )

    parser.add_argument(
        "--local",
        "-l",
        action="store_true",
        default=False,
        help="Force the use of the local data.json file.",
    )

    parser.add_argument(
        "--nsfw",
        action="store_true",
        default=False,
        help="Include checking of NSFW sites from default list.",
    )

    parser.add_argument(
        "--txt",
        action="store_true",
        dest="output_txt",
        default=False,
        help="Enable creation of a txt file.",
    )

    parser.add_argument(
        "--ignore-exclusions",
        action="store_true",
        dest="ignore_exclusions",
        default=False,
        help="Ignore upstream exclusions (may return more false positives).",
    )

    args = parser.parse_args()

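    # Illustrative invocations (usernames are placeholders), assuming the
    # installed `sherlock` entry point:
    #     sherlock someusername
    #     sherlock --site GitHub --site Reddit --csv someusername
    #     sherlock --timeout 10 user{?}name   # {?} expands to _, - and .
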
    # If the user presses CTRL-C, exit gracefully without throwing errors.
    signal.signal(signal.SIGINT, handler)

    # Check for a newer version of Sherlock. If one exists, let the user know about it.
    try:
        latest_release_raw = requests.get(forge_api_latest_release, timeout=10).text
        latest_release_json = json_loads(latest_release_raw)
        latest_remote_tag = latest_release_json["tag_name"]

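        # The remote tag presumably looks like "v1.2.3"; the [1:] slice below
        # drops the leading "v" so it can be compared against __version__.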
        if latest_remote_tag[1:] != __version__:
            print(
                f"Update available! {__version__} --> {latest_remote_tag[1:]}"
                f"\n{latest_release_json['html_url']}"
            )

    except Exception as error:
        print(f"A problem occurred while checking for an update: {error}")

    # Print proxy notice and configure terminal color output.
    if args.proxy is not None:
        print("Using the proxy: " + args.proxy)

    if args.no_color:
        # Disable color output.
        init(strip=True, convert=False)
    else:
        # Enable color output.
        init(autoreset=True)

    # Check if both output methods are entered as input.
    if args.output is not None and args.folderoutput is not None:
        print("You can only use one of the output methods.")
        sys.exit(1)

    # Check validity for single username output.
    if args.output is not None and len(args.username) != 1:
        print("You can only use --output with a single username")
        sys.exit(1)

    # Create object with all information about sites we are aware of.
    try:
        if args.local:
            sites = SitesInformation(
                os.path.join(os.path.dirname(__file__), "resources/data.json"),
                honor_exclusions=False,
            )
        else:
            json_file_location = args.json_file
            if args.json_file:
                # If the --json parameter is a number, interpret it as a pull request number.
                if args.json_file.isnumeric():
                    pull_number = args.json_file
                    pull_url = f"https://api.github.com/repos/sherlock-project/sherlock/pulls/{pull_number}"
                    pull_request_raw = requests.get(pull_url, timeout=10).text
                    pull_request_json = json_loads(pull_request_raw)

                    # Check if it's a valid pull request
                    if "message" in pull_request_json:
                        print(f"ERROR: Pull request #{pull_number} not found.")
                        sys.exit(1)

                    head_commit_sha = pull_request_json["head"]["sha"]
                    json_file_location = f"https://raw.githubusercontent.com/sherlock-project/sherlock/{head_commit_sha}/sherlock_project/resources/data.json"

            sites = SitesInformation(
                data_file_path=json_file_location,
                honor_exclusions=not args.ignore_exclusions,
                do_not_exclude=args.site_list,
            )
    except Exception as error:
        print(f"ERROR: {error}")
        sys.exit(1)

    if not args.nsfw:
        sites.remove_nsfw_sites(do_not_remove=args.site_list)

    # Create original dictionary from SitesInformation() object.
    # Eventually, the rest of the code will be updated to use the new object
    # directly, but this will glue the two pieces together.
    site_data_all = {site.name: site.information for site in sites}
    if args.site_list == []:
        # Not desired to look at a sub-set of sites
        site_data = site_data_all
    else:
        # User desires to selectively run queries on a sub-set of the site list.
        # Make sure that the sites are supported & build up pruned site database.
        site_data = {}
        site_missing = []
        for site in args.site_list:
            counter = 0
            for existing_site in site_data_all:
                if site.lower() == existing_site.lower():
                    site_data[existing_site] = site_data_all[existing_site]
                    counter += 1
            if counter == 0:
                # Build up list of sites not supported for future error message.
                site_missing.append(f"'{site}'")

        if site_missing:
            print(f"Error: Desired sites not found: {', '.join(site_missing)}.")

        if not site_data:
            sys.exit(1)

    # Create notify object for query results.
    query_notify = QueryNotifyPrint(
        result=None, verbose=args.verbose, print_all=args.print_all, browse=args.browse
    )

    # Run report on all specified users.
    all_usernames = []
    for username in args.username:
        if check_for_parameter(username):
            for name in multiple_usernames(username):
                all_usernames.append(name)
        else:
            all_usernames.append(username)

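    # Each sherlock() call below returns a dict keyed by site name; per its
    # docstring, every value carries url_main, url_user, status (a
    # QueryResult), http_status, and response_text for that site.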
    for username in all_usernames:
        results = sherlock(
            username,
            site_data,
            query_notify,
            dump_response=args.dump_response,
            proxy=args.proxy,
            timeout=args.timeout,
        )

        if args.output:
            result_file = args.output
        elif args.folderoutput:
            # The results for each username should be stored in a targeted
            # folder. If the folder doesn't exist, create it first.
            os.makedirs(args.folderoutput, exist_ok=True)
            result_file = os.path.join(args.folderoutput, f"{username}.txt")
        else:
            result_file = f"{username}.txt"

        if args.output_txt:
            with open(result_file, "w", encoding="utf-8") as file:
                exists_counter = 0
                for website_name in results:
                    dictionary = results[website_name]
                    if dictionary.get("status").status == QueryStatus.CLAIMED:
                        exists_counter += 1
                        file.write(dictionary["url_user"] + "\n")
                file.write(f"Total Websites Username Detected On : {exists_counter}\n")

        if args.csv:
            result_file = f"{username}.csv"
            if args.folderoutput:
                # The results for each username should be stored in a targeted
                # folder. If the folder doesn't exist, create it first.
                os.makedirs(args.folderoutput, exist_ok=True)
                result_file = os.path.join(args.folderoutput, result_file)

            with open(result_file, "w", newline="", encoding="utf-8") as csv_report:
                writer = csv.writer(csv_report)
                writer.writerow(
                    [
                        "username",
                        "name",
                        "url_main",
                        "url_user",
                        "exists",
                        "http_status",
                        "response_time_s",
                    ]
                )
                for site in results:
                    if (
                        args.print_found
                        and not args.print_all
                        and results[site]["status"].status != QueryStatus.CLAIMED
                    ):
                        continue

                    response_time_s = results[site]["status"].query_time
                    if response_time_s is None:
                        response_time_s = ""
                    writer.writerow(
                        [
                            username,
                            site,
                            results[site]["url_main"],
                            results[site]["url_user"],
                            str(results[site]["status"].status),
                            results[site]["http_status"],
                            response_time_s,
                        ]
                    )
        if args.xlsx:
            usernames = []
            names = []
            url_main = []
            url_user = []
            exists = []
            http_status = []
            response_time_s = []

            for site in results:
                if (
                    args.print_found
                    and not args.print_all
                    and results[site]["status"].status != QueryStatus.CLAIMED
                ):
                    continue

                # Append the per-site response time, using "" when it is
                # unavailable. (Check the value, not the list itself.)
                response_time = results[site]["status"].query_time
                if response_time is None:
                    response_time_s.append("")
                else:
                    response_time_s.append(response_time)
                usernames.append(username)
                names.append(site)
                url_main.append(results[site]["url_main"])
                url_user.append(results[site]["url_user"])
                exists.append(str(results[site]["status"].status))
                http_status.append(results[site]["http_status"])

            DataFrame = pd.DataFrame(
                {
                    "username": usernames,
                    "name": names,
                    "url_main": [f'=HYPERLINK("{u}")' for u in url_main],
                    "url_user": [f'=HYPERLINK("{u}")' for u in url_user],
                    "exists": exists,
                    "http_status": http_status,
                    "response_time_s": response_time_s,
                }
            )
            DataFrame.to_excel(f"{username}.xlsx", sheet_name="sheet1", index=False)

        print()
    query_notify.finish()


if __name__ == "__main__":
    main()