from __future__ import annotations

import argparse
import sys
import typing
from json import dumps
from os.path import abspath, basename, dirname, join, realpath
from platform import python_version
from unicodedata import unidata_version

import charset_normalizer.md as md_module
from charset_normalizer import from_fp
from charset_normalizer.models import CliDetectionResult
from charset_normalizer.version import __version__


def query_yes_no(question: str, default: str = "yes") -> bool:  # Defensive:
    """Ask a yes/no question via input() and return the answer as a bool.

    The prompt is repeated until a recognised answer is given.

    :param question: Text shown to the user, followed by a ``[Y/n]`` hint.
    :param default: Answer assumed on empty input; ``"yes"`` means True,
        any other value means False.
    :return: True for "y"/"yes", False for "n"/"no" (case-insensitive).
    """
    prompt = " [Y/n] " if default == "yes" else " [y/N] "

    while True:
        choice = input(question + prompt).strip().lower()
        if not choice:
            return default == "yes"
        if choice in ("y", "yes"):
            return True
        if choice in ("n", "no"):
            return False
        print("Please respond with 'y' or 'n'.")


class FileType:
    """Factory for creating file object types

    Instances of FileType are typically passed as type= arguments to the
    ArgumentParser add_argument() method.

    Keyword Arguments:
        - mode -- A string indicating how the file is to be opened. Accepts the
            same values as the builtin open() function.
        - bufsize -- The file's desired buffer size. Accepts the same values as
            the builtin open() function.
        - encoding -- The file's encoding. Accepts the same values as the
            builtin open() function.
        - errors -- A string indicating how encoding and decoding errors are to
            be handled. Accepts the same value as the builtin open() function.

    Backported from CPython 3.12
    """

    def __init__(
        self,
        mode: str = "r",
        bufsize: int = -1,
        encoding: str | None = None,
        errors: str | None = None,
    ):
        self._mode = mode
        self._bufsize = bufsize
        self._encoding = encoding
        self._errors = errors

    def __call__(self, string: str) -> typing.IO:  # type: ignore[type-arg]
        """Open *string* with the configured mode and return the file object.

        :param string: A file name, or "-" to select a standard stream.
        :raises ValueError: If "-" is given with a mode that is neither
            readable nor writable.
        :raises argparse.ArgumentTypeError: If the file cannot be opened.
        """
        # the special argument "-" means sys.std{in,out}
        if string == "-":
            if "r" in self._mode:
                return sys.stdin.buffer if "b" in self._mode else sys.stdin
            elif any(c in self._mode for c in "wax"):
                return sys.stdout.buffer if "b" in self._mode else sys.stdout
            else:
                msg = f'argument "-" with mode {self._mode}'
                raise ValueError(msg)

        # all other arguments are used as file names
        try:
            return open(string, self._mode, self._bufsize, self._encoding, self._errors)
        except OSError as e:
            message = f"can't open '{string}': {e}"
            # Chain the OSError so the root cause stays visible in tracebacks.
            raise argparse.ArgumentTypeError(message) from e

    def __repr__(self) -> str:
        args = self._mode, self._bufsize
        kwargs = [("encoding", self._encoding), ("errors", self._errors)]
        args_str = ", ".join(
            [repr(arg) for arg in args if arg != -1]
            + [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
        )
        return f"{type(self).__name__}({args_str})"


def _close_files(files: typing.Iterable[typing.IO]) -> None:  # type: ignore[type-arg]
    """Close every handle in *files*; closing an already closed one is a no-op."""
    for handle in files:
        handle.close()


def _usage_error(args: argparse.Namespace) -> str | None:
    """Return the message for the first inconsistent option, or None if all is fine."""
    if args.replace and not args.normalize:
        return "Use --replace in addition of --normalize only."
    if args.force and not args.replace:
        return "Use --force in addition of --replace only."
    # Written as a chained comparison so that NaN (accepted by type=float)
    # is rejected as well: every comparison involving NaN is False.
    if not 0.0 <= args.threshold <= 1.0:
        return "--threshold VALUE should be between 0. AND 1."
    return None


def _build_result(
    path: str, match: typing.Any, is_preferred: bool
) -> CliDetectionResult:
    """Convert one detection match into the CLI result record for *path*."""
    return CliDetectionResult(
        path,
        match.encoding,
        match.encoding_aliases,
        [cp for cp in match.could_be_from_charset if cp != match.encoding],
        match.language,
        match.alphabets,
        match.bom,
        match.percent_chaos,
        match.percent_coherence,
        None,
        is_preferred,
    )


def _normalized_path(source_name: str, encoding: str, replace: bool) -> str:
    """Return where the normalized copy of *source_name* is written.

    With *replace* the resolved source path itself is returned; otherwise
    *encoding* is inserted before the last dot-separated part of the file name.
    """
    resolved = realpath(source_name)
    name_parts: list[str] = basename(resolved).split(".")
    if not replace:
        name_parts.insert(-1, encoding)
    return join(dirname(resolved), ".".join(name_parts))


def cli_detect(argv: list[str] | None = None) -> int:
    """
    CLI assistant using ARGV and ArgumentParser

    :param argv: Arguments to parse; None lets argparse read sys.argv.
    :return: 0 if everything is fine, anything else equal trouble
        (1 for inconsistent options, 2 if the normalized file cannot be written)
    """
    parser = argparse.ArgumentParser(
        description="The Real First Universal Charset Detector. "
        "Discover originating encoding used on text file. "
        "Normalize text to unicode."
    )

    parser.add_argument(
        "files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        default=False,
        dest="verbose",
        help="Display complementary information about file if any. "
        "Stdout will contain logs about the detection process.",
    )
    parser.add_argument(
        "-a",
        "--with-alternative",
        action="store_true",
        default=False,
        dest="alternatives",
        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
    )
    parser.add_argument(
        "-n",
        "--normalize",
        action="store_true",
        default=False,
        dest="normalize",
        help="Permit to normalize input file. If not set, program does not write anything.",
    )
    parser.add_argument(
        "-m",
        "--minimal",
        action="store_true",
        default=False,
        dest="minimal",
        help="Only output the charset detected to STDOUT. Disabling JSON output.",
    )
    parser.add_argument(
        "-r",
        "--replace",
        action="store_true",
        default=False,
        dest="replace",
        help="Replace file when trying to normalize it instead of creating a new one.",
    )
    parser.add_argument(
        "-f",
        "--force",
        action="store_true",
        default=False,
        dest="force",
        help="Replace file without asking if you are sure, use this flag with caution.",
    )
    parser.add_argument(
        "-i",
        "--no-preemptive",
        action="store_true",
        default=False,
        dest="no_preemptive",
        help="Disable looking at a charset declaration to hint the detector.",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        action="store",
        default=0.2,
        type=float,
        dest="threshold",
        help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
            __version__,
            python_version(),
            unidata_version,
            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
        ),
        help="Show version information and exit.",
    )

    args = parser.parse_args(argv)

    # argparse has already opened every input file, so each early exit
    # below must release those handles.
    usage_error = _usage_error(args)
    if usage_error is not None:
        _close_files(args.files)
        print(usage_error, file=sys.stderr)
        return 1

    results: list[CliDetectionResult] = []

    try:
        for my_file in args.files:
            # The context manager closes the handle on every way out of the
            # iteration (normal end, continue, return, exception).
            with my_file:
                matches = from_fp(
                    my_file,
                    threshold=args.threshold,
                    explain=args.verbose,
                    preemptive_behaviour=args.no_preemptive is False,
                )
                best_guess = matches.best()
                source_path = abspath(my_file.name)

                if best_guess is None:
                    print(
                        'Unable to identify originating encoding for "{}". {}'.format(
                            my_file.name,
                            (
                                "Maybe try increasing maximum amount of chaos."
                                if args.threshold < 1.0
                                else ""
                            ),
                        ),
                        file=sys.stderr,
                    )
                    results.append(
                        CliDetectionResult(
                            source_path,
                            None,
                            [],
                            [],
                            "Unknown",
                            [],
                            False,
                            1.0,
                            0.0,
                            None,
                            True,
                        )
                    )
                    continue

                cli_result = _build_result(source_path, best_guess, True)
                results.append(cli_result)

                if args.alternatives and len(matches) > 1:
                    results.extend(
                        _build_result(source_path, el, False)
                        for el in matches
                        if el != best_guess
                    )

                if not args.normalize:
                    continue

                if best_guess.encoding.startswith("utf"):
                    print(
                        '"{}" file does not need to be normalized, as it already came from unicode.'.format(
                            my_file.name
                        ),
                        file=sys.stderr,
                    )
                    continue

                # Overwriting in place needs --force or an explicit confirmation.
                if (
                    args.replace
                    and not args.force
                    and not query_yes_no(
                        'Are you sure to normalize "{}" by replacing it ?'.format(
                            my_file.name
                        ),
                        "no",
                    )
                ):
                    continue

                target_path = _normalized_path(
                    my_file.name, best_guess.encoding, args.replace
                )

                # The input handle is no longer needed; release it before
                # (possibly) overwriting the very same path.
                my_file.close()

                try:
                    cli_result.unicode_path = target_path

                    with open(cli_result.unicode_path, "wb") as fp:
                        fp.write(best_guess.output())
                except OSError as e:  # Defensive:
                    print(str(e), file=sys.stderr)
                    return 2
    finally:
        # Covers the files not reached yet when the loop stops early.
        _close_files(args.files)

    if args.minimal is False:
        print(
            dumps(
                [el.__dict__ for el in results]
                if len(results) > 1
                else results[0].__dict__,
                ensure_ascii=True,
                indent=4,
            )
        )
    else:
        for my_file in args.files:
            print(
                ", ".join(
                    [
                        el.encoding or "undefined"
                        for el in results
                        if el.path == abspath(my_file.name)
                    ]
                )
            )

    return 0


if __name__ == "__main__":  # Defensive:
    # Propagate the status code so shell callers can detect failures.
    sys.exit(cli_detect())