# Source code for pyminifier

# -*- coding: utf-8 -*-
#
#       Copyright 2013 Liftoff Software Corporation
#                 2022 Dong Zhuang
# For license information see LICENSE.txt

# Meta
# Package metadata: the version as a string and as a matching tuple of ints,
# the license identifier, and the authors.
__version__ = "2.3.3"
__version_info__ = (2, 3, 3)  # Same version as __version__, as a tuple
__license__ = "GPLv3"  # See LICENSE.txt
# The two adjacent string literals are concatenated into one author string;
# the only separator between the entries is the comma ending the first literal.
__author__ = (
    "Dan McDougall <daniel.mcdougall@liftoffsoftware.com>,"
    "Dong Zhuang <dzhuang.scut@gmail.com>")

# TODO: Add the ability to mark variables, functions, classes, and methods
#       for non-obfuscation.
# TODO: Add the ability to selectively obfuscate identifiers inside strings
#      (for metaprogramming stuff).
# TODO: Add the ability to use a config file instead of just command line args.
# TODO: Add the ability to save a file that allows for de-obfuscation later
#      (or at least the ability to debug).
# TODO: Separate out the individual functions of minification so that they can
#      be chosen selectively like the obfuscation functions.
# TODO: A conflict file entry in the windows operating system

__doc__ = """\
**Python Minifier:**  Reduces the size of (minifies) Python code for use on
embedded platforms.

Performs the following:

    * Removes docstrings.
    * Removes comments.
    * Minimizes code indentation.
    * Removes trailing commas.
    * Joins multiline pairs of parentheses, braces, and brackets (and removes
      extraneous whitespace within).
    * Joins disjointed strings like, ("some" "disjointed" "string") into single
      strings: ('''some disjointed string''')
    * Preserves shebangs and encoding info (e.g. "# -- coding: utf-8 --").
    * Optionally, produces a bzip2 or gzip-compressed self-extracting python script
      containing the minified source for ultimate minification.
      *Added in version 1.4*
    * Optionally, obfuscates the code using the shortest possible combination of
      letters and numbers for one or all of class names, function/method names, and
      variables. The options are ``--obfuscate`` or ``-O`` to obfuscate everything,
      ``--obfuscate-variables``, ``--obfuscate-functions``,
      and ``--obfuscate-classes`` to obfuscate things individually
      (say, if you wanted to keep your module usable by external programs).
      *Added in version 2.0*
    * Optionally, a value may be specified via --replacement-length to set the
      minimum length of random strings that are used to replace identifier names
      when obfuscating.
    * Optionally, you may specify ``--nonlatin`` to use funky
      unicode characters when obfuscating. WARNING: This will result in some
      seriously hard-to-read code! **Tip:** Combine this setting with higher
      ``--replacement-length`` values to make the output even wackier.
      *Added in version 2.0*
    * Pyminifier can now minify/obfuscate an arbitrary number of Python
      scripts in one go.  For example, ``pyminifier -O *.py`` will minify and
      obfuscate all files in the current directory ending in .py.  To prevent
      issues with using differentiated obfuscated identifiers across multiple files,
      pyminifier will keep track of what replaces what via a lookup table to ensure
      foo_module.whatever gets the same replacement across all source files.
      *Added in version 2.0*
    * Optionally, creates an executable zip archive (pyz) containing the
      minified/obfuscated source script and all implicit (local path) imported
      modules.  This mechanism automatically figures out which source files to
      include in the .pyz archive by analyzing the script passed to pyminifier
      on the command line (listing all the modules your script uses is unnecessary).
      This is also the **ultimate** in minification/compression besting both the
      gzip and bzip2 compression mechanisms with the disadvantage that .pyz files
      cannot be imported into other Python scripts.
      *Added in version 2.0*

Just how much space can be saved by pyminifier? Here's a comparison:

    * The pyminifier3 (as of version 2.1) source (all 7 files) takes up about 106k.
    * Performing basic minification on all pyminifier3 source files reduces that
      to ~104k.
    * Minification plus obfuscation provides a further reduction to 92k.
    * Minification plus the base64-encoded gzip trick (--gzip) reduces it to 76k.
    * Minification plus gzip compression plus obfuscation is also 76k
      (demonstrating that obfuscation makes no difference when compression
      algorithms are used).
    * Using the --pyz option on pyminifier.py creates a ~14k .pyz file that
      includes all the aforementioned files.

Various examples and edge cases are sprinkled throughout the pyminifier code so
that it can be tested by minifying itself.  The way to test is thus:

.. code-block:: bash

    $ python __main__.py __main__.py > minified_pyminifier.py
    $ python minified_pyminifier.py __main__.py > this_should_be_identical.py
    $ diff minified_pyminifier.py this_should_be_identical.py
    $

If you get an error executing minified_pyminifier.py or
``this_should_be_identical.py`` isn't identical to minified_pyminifier.py then
something is broken.

.. note::

    The test functions below are meaningless.  They only serve as test/edge
    cases for testing pyminifier3.
"""

import io
import os
import re
import sys
from collections.abc import Iterable

from . import compression, minification, obfuscate, token_utils

lzma = False
try:
    import lzma
except ImportError:
    pass

# Name of the running operating system as reported by ``os.name``
# ("nt" on Windows).
os_name = os.name

# Regexes
# Matches a line-continuation backslash, optionally followed by whitespace and
# a trailing comment, up to and including the newline.  Written as a raw
# string so that ``\s`` is not an invalid string escape.
multiline_indicator = re.compile(r"\\(\s*#.*)?\n")

# The test.+() functions below are for testing pyminifier...

# Footer line appended to every generated result.
RESULT_FOOTER = "# Created by pyminifier (https://github.com/dzhuang/pyminifier3)"


def test_decorator(f):
    """
    Decorator that does nothing.

    Returns *f* itself, unchanged.  It is applied to a method inside
    ``test_class`` in this module.
    """
    return f


def test_reduce_operators():
    """Test the case where an operator such as an open paren starts a line"""
    # NOTE: the statement layout below is the fixture itself (see the module
    # docstring: these functions only exist as edge cases), so leave it as-is.
    (a, b) = 1, 2  # The indentation level should be preserved  # noqa
    pass


def test_empty_functions():
    """
    This is a test function.
    This should be replaced with "def test_empty_functions(): pass"

    The body is intentionally nothing but this docstring; do not add
    statements here.
    """


class test_class(object):
    """
    Testing indented decorators.

    Holds a single do-nothing method wrapped with ``test_decorator``; an
    instance is created and the method called from ``test_function()``.
    """

    @test_decorator
    def test_function(self):
        pass


def test_function():
    """
    This function encapsulates the edge cases to prevent them from invading the
    global namespace.

    The odd layout of the statements below (comments inside a parenthesised
    string, multi-line literals, extra spaces before ``=``) is the fixture
    itself and must be kept as-is.  NOTE(review): ``imaginary_function`` is
    not defined anywhere in the visible source, so this function is presumably
    never meant to be called -- confirm before invoking it.
    """
    # This tests method obfuscation:
    method_obfuscate = test_class()
    method_obfuscate.test_function()
    foo = ("The # character in this string should "  # This comment  # noqa
           "not result in a syntax error")  # ...and this one should go away  # noqa
    test_multi_line_list = [  # noqa
        "item1",
        "item2",
        "item3"
    ]
    test_multi_line_dict = {  # noqa
        "item1": 1,
        "item2": 2,
        "item3": 3
    }
    # It may seem strange but the code below tests our docstring removal code.
    test_string_inside_operators = imaginary_function(  # noqa
        "This string was indented but the tokenizer won't see it that way."
    )  # To understand how this could mess up docstring removal code see the
    # minification.minification.remove_comments_and_docstrings() function
    # starting at this line:
    #     "elif token_type == tokenize.STRING:"
    # This tests remove_extraneous_spaces():
    this_line_has_leading_indentation    = """<--That extraneous space should be
                                              removed"""  # But not these spaces  # noqa


def is_iterable(obj):
    """
    Returns `True` if *obj* is iterable but *not* if *obj* is a string, bytes,
    or a bytearray.

    Iterability is decided with ``isinstance(obj, collections.abc.Iterable)``.
    ``str``, ``bytes`` and ``bytearray`` are rejected first because they are
    iterable too, yet must not be treated as a collection of items here.
    """
    if isinstance(obj, (str, bytes, bytearray)):
        return False
    return isinstance(obj, Iterable)


def _module_name_from_path(path):
    """Return the file name of *path* with its last dotted extension removed."""
    filename = os.path.split(path)[1]
    return ".".join(filename.split(".")[:-1])


def _percent_of_original(new_size, original_size):
    """Return *new_size* as a percentage of *original_size* (0 if that is 0)."""
    if int(original_size) == 0:
        return 0
    return round(float(new_size) / float(original_size) * 100, 2)


def _pack_result(result, options):
    """Compress *result* if *options* ask for it, then append the footer line."""
    if options.bzip2:
        result = compression.bz2_pack(result)
    elif options.gzip:
        result = compression.gz_pack(result)
    elif lzma and options.lzma:  # lzma is False when the import failed
        result = compression.lzma_pack(result)
    return result + "{}\n".format(RESULT_FOOTER)


def pyminify(options, files):
    """
    Given an *options* object (from `optparse.OptionParser` or similar),
    performs minification and/or obfuscation on the given *files* (any iterable
    containing file paths) based on said *options*.

    All accepted options can be listed by running ``python __main__.py -h`` or
    examining the :py:func:`__init__.main` function.

    Behaviour, by case:

    * ``options.pyz`` set: exactly one file is packed through
      ``compression.zip_pack`` and nothing else is done.
    * More than one file: every file is processed and written into
      ``options.destdir`` under its original file name, and size statistics
      are printed.
    * Exactly one file: the result is written to ``options.outfile`` if set,
      otherwise printed to stdout.

    Prints a message and calls ``sys.exit(1)`` when *files* is not an iterable
    (strings and bytes do not count), when it is empty, or when ``--pyz`` is
    combined with more than one file.  Always returns ``None``.
    """
    global name_generator
    if not is_iterable(files):
        print(
            "Error: The 'files' argument must be a list, tuple, etc of files.  "
            "Strings and bytes won't work.")
        sys.exit(1)
    # Materialize once so that len() and indexing below work for any iterable
    # (generators, sets, ...), as the docstring promises.
    files = list(files)
    if not files:
        print("Error: The 'files' argument must contain at least one file.")
        sys.exit(1)
    if options.pyz:
        # Check to make sure we were only passed one script (only one at a time)
        if len(files) > 1:
            print("ERROR: The --pyz option only works with one python file at "
                  "a time.")
            print("(Dependencies will be automagically included in the "
                  "resulting .pyz)")
            sys.exit(1)
        # Make our .pyz:
        compression.zip_pack(files[0], options)
        return None  # Make sure we don't do anything else
    # Read in our prepend text (if any).  Best effort: a failure is reported
    # and processing continues without any prepended text.
    prepend = None
    if options.prepend:
        try:
            with open(options.prepend, encoding="utf-8") as f:
                prepend = f.read()
        except Exception as err:
            print("Error reading %s:" % options.prepend)
            print(err)

    obfuscating = any((
        options.obfuscate, options.obf_classes,
        options.obf_functions, options.obf_variables,
        options.obf_builtins, options.obf_import_methods))

    # Automatically enable obfuscation if --nonlatin (implied if no explicit
    # obfuscation is stated).  The flag computed above is refreshed as well so
    # the multi-file branch sees the implied obfuscation too.
    if options.use_nonlatin and not obfuscating:
        options.obfuscate = True
        obfuscating = True

    if len(files) > 1:  # We're dealing with more than one file
        name_generator = None  # So we can tell if we need to obfuscate
        if obfuscating:
            # Put together the name generator and lookup table that will be
            # shared by all files so replacements stay consistent across them:
            identifier_length = int(options.replacement_length)
            if options.use_nonlatin:
                name_generator = obfuscate.obfuscation_machine(
                    use_unicode=True, identifier_length=identifier_length
                )
            else:
                name_generator = obfuscate.obfuscation_machine(
                    identifier_length=identifier_length)
            table = [{}]
        cumulative_size = 0  # For size reduction stats
        cumulative_new = 0  # Ditto
        for sourcefile in files:
            # Record how big the file is so we can compare afterwards
            filesize = os.path.getsize(sourcefile)
            cumulative_size += filesize
            # Get the module name from the path
            module = _module_name_from_path(sourcefile)
            with open(sourcefile, encoding="utf-8") as _f:
                source = _f.read()
            tokens = token_utils.listified_tokenizer(source)
            if not options.nominify:  # Perform minification
                source = minification.minify(tokens, options)
            # Have to re-tokenize for obfuscation (it is quick):
            tokens = token_utils.listified_tokenizer(source)
            # Perform obfuscation if any of the related options were set
            if name_generator:
                obfuscate.obfuscate(
                    module,
                    tokens,
                    options,
                    name_generator=name_generator,
                    table=table
                )
            # Convert back to text
            result = "# -*- coding: utf-8 -*-\n"
            if prepend:
                result += prepend
            result += token_utils.untokenize(tokens)
            # Compress it if we were asked to do so, then add the footer
            result = _pack_result(result, options)
            # Everything goes into destdir, under the original file name
            os.makedirs(options.destdir, exist_ok=True)
            path = os.path.join(options.destdir, os.path.split(sourcefile)[1])
            with open(path, "w", encoding="utf-8") as f:
                f.write(result)
            new_filesize = os.path.getsize(path)
            cumulative_new += new_filesize
            percent_saved = _percent_of_original(new_filesize, filesize)
            print(
                "{sourcefile} ({filesize}) reduced to {new_filesize} bytes "
                "({percent_saved}% of original size)".format(
                    sourcefile=sourcefile,
                    filesize=filesize,
                    new_filesize=new_filesize,
                    percent_saved=percent_saved))
        p_saved = _percent_of_original(cumulative_new, cumulative_size)
        print("Overall size reduction: {0}% of original size".format(p_saved))
    else:
        # Get the module name from the path
        _file = files[0]
        module = _module_name_from_path(_file)
        filesize = os.path.getsize(_file)
        with open(_file, encoding="utf-8") as f:
            source = f.read()

        # Convert the tokens from a tuple of tuples to a list of lists so we can
        # update in-place.
        tokens = token_utils.listified_tokenizer(source)
        if not options.nominify:  # Perform minification
            source = minification.minify(tokens, options)
            # Convert back to tokens in case we're obfuscating
            tokens = token_utils.listified_tokenizer(source)
        # Perform obfuscation if any of the related options were set
        if obfuscating:
            identifier_length = int(options.replacement_length)
            # NOTE(review): this generator is stored in the module-level
            # ``name_generator`` but is not passed to obfuscate.obfuscate()
            # below -- kept as-is; confirm whether that is intended.
            name_generator = obfuscate.obfuscation_machine(
                use_unicode=options.use_nonlatin,
                identifier_length=identifier_length)
            obfuscate.obfuscate(module, tokens, options)
        # Convert back to text
        result = ""
        if prepend:
            result += prepend
        result += token_utils.untokenize(tokens)
        # Compress it if we were asked to do so, then add the footer
        result = _pack_result(result, options)
        # Either save the result to the output file or print it to stdout
        if options.outfile:
            with io.open(options.outfile, "w", encoding="utf-8") as f:
                f.write(result)
            new_filesize = os.path.getsize(options.outfile)
            percent_saved = _percent_of_original(new_filesize, filesize)
            print(
                "{_file} ({filesize}) reduced to {new_filesize} bytes "
                "({percent_saved}% of original size)".format(
                    _file=_file,
                    filesize=filesize,
                    new_filesize=new_filesize,
                    percent_saved=percent_saved))
        else:
            print(result)