# Source code for pyminifier.analyze

# -*- coding: utf-8 -*-

__doc__ = """\
A module of useful functions for analyzing Python code.
"""

import keyword
import os
import re
import tokenize

# Matches a shebang line such as "#!/usr/bin/env python".  Raw strings are
# used so that the regex escapes (\!, \s, \w) reach the ``re`` module intact
# instead of being treated as (invalid) Python string escapes.
shebang = re.compile(r"^#\!.*$")
# Matches a source-encoding declaration (e.g. "# -*- coding: utf-8 -*-");
# group 1 captures the encoding name.
encoding = re.compile(r".*coding[:=]\s*([-\w.]+)")
# __builtins__ is different for every module so we need a hard-coded list.
# The list mixes names from several environments: besides the usual builtin
# functions and exceptions it carries Python 2 era names (e.g. "unicode",
# "xrange", "raw_input") and names that appear to come from an IPython
# session (e.g. "__IPYTHON__", "ipmagic") -- NOTE(review): confirm the latter
# are still wanted.
# It must stay a ``list``: ``reserved_words`` below is built by concatenating
# it with ``keyword.kwlist``.
builtins = [
    "ArithmeticError",
    "AssertionError",
    "AttributeError",
    "BaseException",
    "BufferError",
    "BytesWarning",
    "DeprecationWarning",
    "EOFError",
    "Ellipsis",
    "EnvironmentError",
    "Exception",
    "False",
    "FloatingPointError",
    "FutureWarning",
    "GeneratorExit",
    "IOError",
    "ImportError",
    "ImportWarning",
    "IndentationError",
    "IndexError",
    "KeyError",
    "KeyboardInterrupt",
    "LookupError",
    "MemoryError",
    "NameError",
    "None",
    "NotImplemented",
    "NotImplementedError",
    "OSError",
    "OverflowError",
    "PendingDeprecationWarning",
    "ReferenceError",
    "RuntimeError",
    "RuntimeWarning",
    "StandardError",
    "StopIteration",
    "SyntaxError",
    "SyntaxWarning",
    "SystemError",
    "SystemExit",
    "TabError",
    "True",
    "TypeError",
    "UnboundLocalError",
    "UnicodeDecodeError",
    "UnicodeEncodeError",
    "UnicodeError",
    "UnicodeTranslateError",
    "UnicodeWarning",
    "UserWarning",
    "ValueError",
    "Warning",
    "ZeroDivisionError",
    "__IPYTHON__",
    "__IPYTHON__active",
    "__debug__",
    "__doc__",
    "__import__",
    "__name__",
    "__package__",
    "abs",
    "all",
    "any",
    "apply",
    "basestring",
    "bin",
    "bool",
    "buffer",
    "bytearray",
    "bytes",
    "callable",
    "chr",
    "classmethod",
    "cmp",
    "coerce",
    "compile",
    "complex",
    "copyright",
    "credits",
    "delattr",
    "dict",
    "dir",
    "divmod",
    "dreload",
    "enumerate",
    "eval",
    "execfile",
    "exit",
    "file",
    "filter",
    "float",
    "format",
    "frozenset",
    "getattr",
    "globals",
    "hasattr",
    "hash",
    "help",
    "hex",
    "id",
    "input",
    "int",
    "intern",
    "ip_set_hook",
    "ipalias",
    "ipmagic",
    "ipsystem",
    "isinstance",
    "issubclass",
    "iter",
    "jobs",
    "len",
    "license",
    "list",
    "locals",
    "long",
    "map",
    "max",
    "min",
    "next",
    "object",
    "oct",
    "open",
    "ord",
    "pow",
    "print",
    "property",
    "quit",
    "range",
    "raw_input",
    "reduce",
    "reload",
    "repr",
    "reversed",
    "round",
    "set",
    "setattr",
    "slice",
    "sorted",
    "staticmethod",
    "str",
    "sum",
    "super",
    "tuple",
    "type",
    "unichr",
    "unicode",
    "vars",
    "xrange",
    "zip"
]

# Every name the analysis functions must never treat as a module name:
# the language keywords followed by the hard-coded builtins above.
reserved_words = keyword.kwlist + builtins


def enumerate_keyword_args(tokens):
    """
    Iterates over *tokens* and returns a dictionary with function names as the
    keys and lists of keyword arguments as the values.

    *tokens* is a sequence of ``tokenize``-style tuples; only the token type
    (item 0) and token string (item 1) are examined.  A NAME token counts as
    a keyword argument when it sits on the same logical line as a ``def`` and
    is immediately followed by ``=``.  A function that is defined more than
    once keeps only the keyword arguments of its last definition.
    """
    keyword_args = {}
    # Name of the function whose ``def`` line is being scanned; reset to
    # None as soon as that logical line ends.
    function_name = None
    last_index = len(tokens) - 1
    for index, tok in enumerate(tokens):
        token_type, token_string = tok[0], tok[1]
        if token_type == tokenize.NEWLINE:
            function_name = None
            continue
        # A NAME that is the very last token has no successor to inspect
        # (possible with truncated token lists); skip it instead of raising.
        if token_type != tokenize.NAME or index == last_index:
            continue
        next_string = tokens[index + 1][1]
        if token_string == "def":
            function_name = next_string
            keyword_args[function_name] = []
        elif function_name and next_string == "=":  # keyword argument
            keyword_args[function_name].append(token_string)
    return keyword_args
def enumerate_imports(tokens):
    """
    Walks *tokens* and returns the list of imported module names, in order of
    first appearance and without duplicates.

    .. note::

        Names on a line containing ``from`` are skipped, as is any name that
        is immediately followed by ``as`` (the alias after ``as`` is still
        recorded).  Anything found in ``reserved_words`` is never recorded.
    """
    seen = []
    on_import_line = False
    is_from_import = False
    for position, current in enumerate(tokens):
        kind, text = current[0], current[1]
        if kind == tokenize.NEWLINE:
            # End of the logical line: forget any import state.
            on_import_line = is_from_import = False
            continue
        if text == "import":
            on_import_line = True
            continue
        if text == "from":
            is_from_import = True
            continue
        if not on_import_line or kind != tokenize.NAME:
            continue
        if tokens[position + 1][1] == "as":
            continue
        if is_from_import or text in reserved_words or text in seen:
            continue
        seen.append(text)
    return seen
def enumerate_global_imports(tokens):
    """
    Returns a list of all globally imported modules (imports made inside of
    classes, methods, or functions are skipped).  Example::

        >>> enumerate_global_imports(tokens)
        ["sys", "os", "tokenize", "re"]

    Dotted imports (``import a.b``) are recorded as a single ``"a.b"`` entry.

    .. note::

        Does not enumerate imports using the "from" or "as" keywords.
    """
    global_modules = []
    on_import_line = False
    is_from_import = False
    dotted_prefix = ""  # Pending "package." part of a dotted import
    open_scopes = 0     # Bumped on every def/class keyword
    depth = 0           # Current INDENT/DEDENT nesting level
    for position, current in enumerate(tokens):
        kind, text = current[0], current[1]
        if kind == tokenize.INDENT:
            depth += 1
            continue
        if kind == tokenize.DEDENT:
            depth -= 1
            continue
        if kind == tokenize.NEWLINE:
            on_import_line = False
            is_from_import = False
            continue
        if kind != tokenize.NAME:
            continue
        if text in ("def", "class"):
            open_scopes += 1
        if depth == open_scopes - 1:
            open_scopes -= 1
            continue
        if open_scopes < depth:
            # Deeper than any counted scope: not a global statement.
            continue
        if text == "import":
            on_import_line = True
            continue
        if text == "from":
            is_from_import = True
            continue
        if not on_import_line:
            continue
        following = tokens[position + 1][1]
        if following == "as" or is_from_import:
            continue
        if text in reserved_words or text in global_modules:
            continue
        if following == ".":  # module.module
            dotted_prefix = text + "."
        elif dotted_prefix:
            global_modules.append(dotted_prefix + text)
            dotted_prefix = ""
        else:
            global_modules.append(text)
    return global_modules
# TODO: Finish this (even though it isn't used):
def enumerate_dynamic_imports(tokens):
    """
    Intended to return a dictionary of all dynamically imported modules
    (those inside of classes or functions) in the form of
    {<func or class name>: [<modules>]}, e.g.::

        {"myfunc": ["zlib", "base64"]}

    .. note::

        This function is unfinished.  What it currently returns is a flat
        list (no duplicates, in order of appearance) of the names found on
        lines whose ``import`` keyword is either the very first token or
        directly follows a NEWLINE token.  Names followed by ``as`` and names
        in ``reserved_words`` are skipped.
    """
    imported_modules = []
    import_line = False
    last_index = len(tokens) - 1
    for index, tok in enumerate(tokens):
        token_type = tok[0]
        token_string = tok[1]
        if token_type == tokenize.NEWLINE:
            import_line = False
        elif token_string == "import":
            # The first token has no predecessor.  tokens[-1] would silently
            # wrap around to the *last* token rather than raise, so the
            # first-line case has to be tested explicitly.
            if index == 0 or tokens[index - 1][0] == tokenize.NEWLINE:
                import_line = True
        elif import_line and token_type == tokenize.NAME:
            # Only look ahead when there is a following token.
            if index < last_index and tokens[index + 1][1] == "as":
                continue
            if token_string in reserved_words:
                continue
            if token_string not in imported_modules:
                imported_modules.append(token_string)
    return imported_modules
def enumerate_method_calls(tokens, modules):
    """
    Returns a list of all object (not module) method calls in the given
    *tokens*, as ``"<object>.<method>"`` strings in order of first appearance
    and without duplicates.

    *modules* is expected to be a list of all global modules imported into
    the source code we're working on; calls whose owner is one of these names
    are left out.  Calls made on a literal or on the result of an expression
    (owner token is ``""``, ``''``, ``]``, ``)`` or ``}``) are left out too.

    For example::

        >>> enumerate_method_calls(tokens, ["re", "sys"])
        ["f.write"]
    """
    out = []
    last_index = len(tokens) - 1
    # Owner tokens that are not a plain name we could refer to.
    unnamed_owners = ('""', "''", "]", ")", "}")
    for index, tok in enumerate(tokens):
        if tok[0] != tokenize.NAME:
            continue
        # A method call needs "<owner> . <name> (": two tokens before the
        # name and one after it.  Checking the bounds up front avoids both an
        # IndexError at the end of the list and negative indexes wrapping
        # around to the end of the list at its start.
        if index < 2 or index >= last_index:
            continue
        if tokens[index + 1][1] != "(":  # Not a call
            continue
        if tokens[index - 1][1] != ".":  # Not attached to anything
            continue
        owner = tokens[index - 2][1]
        if owner in unnamed_owners or owner in modules:
            continue
        method_call = "%s.%s" % (owner, tok[1])
        if method_call not in out:
            out.append(method_call)
    return out
def enumerate_builtins(tokens):
    """
    Returns a list of all the builtins being used in *tokens*, in order of
    first appearance and without duplicates.

    A token counts when its string is in the module-level ``builtins`` list,
    does not start with ``__``, is not preceded by ``.`` (attribute access)
    and is not followed by ``=``.
    """
    # todo: I need to test if print can be replaced
    found = []
    for position, current in enumerate(tokens):
        name = current[1]
        if name not in builtins:
            continue
        if name.startswith("__"):  # Don't count magic funcs
            continue
        if tokens[position - 1][1] == "." or tokens[position + 1][1] == "=":
            continue
        if name not in found:
            found.append(name)
    return found
def enumerate_import_methods(tokens):
    """
    Returns a list of imported module methods (such as re.compile) inside
    *tokens*.

    For every module reported by ``enumerate_global_imports`` each token
    equal to that module name and followed by ``.`` contributes one
    ``"<module>.<attribute>"`` entry; duplicates are dropped.
    """
    # Stand-in used wherever the look-ahead would run past the last token.
    newline_stub = (54, "\n", (1, 1), (1, 2), "#\n")
    token_total = len(tokens)
    found = []
    for module_name in enumerate_global_imports(tokens):
        for position, current in enumerate(tokens):
            if current[1] != module_name:
                continue
            if position + 1 < token_total:
                following = tokens[position + 1]
            else:  # Last token, no biggie
                following = newline_stub
            if following[1] != ".":
                continue
            # We're calling a method
            if position + 2 < token_total:
                attribute = tokens[position + 2]
            else:
                attribute = newline_stub
            dotted = "%s.%s" % (current[1], attribute[1])
            if dotted not in found:
                found.append(dotted)
    return found
def enumerate_local_modules(tokens, path):
    """
    Returns a list of modules inside *tokens* that are local to *path*.

    A module is considered local when a matching ``.py`` file exists under
    *path*; files in sub-directories are named with their dotted path
    relative to *path* (``sub/mod.py`` becomes ``"sub.mod"``).

    **Note:**  Will recursively look inside *path* for said modules.
    """
    # Have to get a list of all modules before we can do anything else
    modules = enumerate_imports(tokens)
    local_modules = []
    # Now check the local dir for matching modules
    for root, _dirs, files in os.walk(path):
        # Work out the package prefix from the path relative to *path*.
        # (Splitting the path on the top directory's name breaks when that
        # name occurs more than once in the path or when *path* ends with a
        # separator.)
        relative = os.path.relpath(root, path)
        if relative == os.curdir:
            package = ""
        else:
            package = relative.replace(os.sep, ".").lstrip(".")
        for filename in files:
            if not filename.endswith(".py"):
                continue
            stem = filename[:-3]  # Strip .py
            module = "%s.%s" % (package, stem) if package else stem
            # Keep only the modules the source actually imports.
            if module in modules:
                local_modules.append(module)
    return local_modules
def get_shebang(tokens):
    """
    Returns the shebang line found in *tokens*, or None when there is none.

    Only the first four tokens are inspected; for each, the source line the
    token came from (item 4 of the token tuple) is tested against the
    module-level ``shebang`` pattern (e.g. "#!/usr/bin/env python").
    """
    # Will always be in the first four tokens
    leading_lines = (tok[4] for tok in tokens[:4])
    return next((text for text in leading_lines if shebang.match(text)), None)