diff options
| author | robot-piglet <[email protected]> | 2023-12-02 01:45:21 +0300 |
|---|---|---|
| committer | robot-piglet <[email protected]> | 2023-12-02 02:42:50 +0300 |
| commit | 9c43d58f75cf086b744cf4fe2ae180e8f37e4a0c (patch) | |
| tree | 9f88a486917d371d099cd712efd91b4c122d209d /contrib/python/python-magic/py3/magic/__init__.py | |
| parent | 32fb6dda1feb24f9ab69ece5df0cb9ec238ca5e6 (diff) | |
Intermediate changes
Diffstat (limited to 'contrib/python/python-magic/py3/magic/__init__.py')
| -rw-r--r-- | contrib/python/python-magic/py3/magic/__init__.py | 469 |
1 files changed, 469 insertions, 0 deletions
# diff --git a/contrib/python/python-magic/py3/magic/__init__.py b/contrib/python/python-magic/py3/magic/__init__.py
# new file mode 100644, index 00000000000..bab7c7b122d
"""
magic is a wrapper around the libmagic file identification library.

See README for more information.

Usage:

>>> import magic
>>> magic.from_file("testdata/test.pdf")
'PDF document, version 1.2'
>>> magic.from_file("testdata/test.pdf", mime=True)
'application/pdf'
>>> magic.from_buffer(open("testdata/test.pdf", "rb").read(1024))
'PDF document, version 1.2'
>>>

"""

import sys
import glob
import ctypes
import ctypes.util
import threading
import logging

from ctypes import c_char_p, c_int, c_size_t, c_void_p, byref, POINTER

# Keep a handle on the builtin open: _add_compat() at the bottom of this
# module installs compat.open under the name "open" in this namespace.
_real_open = open


class MagicException(Exception):
    """
    Raised when a libmagic call reports failure.

    `message` is whatever magic_error() returned for the cookie; it can be
    None when libmagic failed without providing an error string.
    """

    def __init__(self, message):
        # Name this class (not Exception) so the full MRO is initialised.
        # The explicit two-argument form keeps Python 2 compatibility.
        super(MagicException, self).__init__(message)
        self.message = message


class Magic:
    """
    Magic is a wrapper around the libmagic C library.
    """

    def __init__(self, mime=False, magic_file=None, mime_encoding=False,
                 keep_going=False, uncompress=False, raw=False, extension=False):
        """
        Create a new libmagic wrapper.

        mime - if True, mimetypes are returned instead of textual descriptions
        mime_encoding - if True, codec is returned
        magic_file - use a mime database other than the system default
        keep_going - don't stop at the first match, keep going
        uncompress - Try to look inside compressed files.
        raw - Do not try to decode "non-printable" chars.
        extension - Print a slash-separated list of valid extensions for the file type found.

        Raises MagicException if the magic database cannot be loaded and
        NotImplementedError if `extension` is requested but the loaded
        libmagic is too old (or does not report its version).
        """
        self.flags = MAGIC_NONE
        if mime:
            self.flags |= MAGIC_MIME_TYPE
        if mime_encoding:
            self.flags |= MAGIC_MIME_ENCODING
        if keep_going:
            self.flags |= MAGIC_CONTINUE
        if uncompress:
            self.flags |= MAGIC_COMPRESS
        if raw:
            self.flags |= MAGIC_RAW
        if extension:
            self.flags |= MAGIC_EXTENSION

        self.cookie = magic_open(self.flags)
        # Serialises every identification call made through this cookie.
        self.lock = threading.Lock()

        magic_load(self.cookie, magic_file)

        # MAGIC_EXTENSION was added in 523 or 524, so bail if
        # it doesn't appear to be available
        if extension and (not _has_version or version() < 524):
            raise NotImplementedError('MAGIC_EXTENSION is not supported in this version of libmagic')

        # For https://github.com/ahupp/python-magic/issues/190
        # libmagic has fixed internal limits that some files exceed, causing
        # an error.  We can avoid this (at least for the sample file given)
        # by bumping the limit up.  It's not clear if this is a general solution
        # or whether other internal limits should be increased, but given
        # the lack of other reports I'll assume this is rare.
        if _has_param:
            try:
                self.setparam(MAGIC_PARAM_NAME_MAX, 64)
            except MagicException:
                # some versions of libmagic fail this call,
                # so rather than fail hard just use default behavior
                pass

    def from_buffer(self, buf):
        """
        Identify the contents of `buf`
        """
        # if we're on python3, convert buf to bytes
        # otherwise this string is passed as wchar*
        # which is not what libmagic expects
        # NEXTBREAK: only take bytes
        if isinstance(buf, str) and str is not bytes:
            buf = buf.encode('utf-8', errors='replace')

        with self.lock:
            try:
                return maybe_decode(magic_buffer(self.cookie, buf))
            except MagicException as e:
                return self._handle509Bug(e)

    def from_file(self, filename):
        """
        Identify the contents of the file named `filename`
        """
        # raise FileNotFoundException or IOError if the file does not exist
        with _real_open(filename):
            pass

        with self.lock:
            try:
                return maybe_decode(magic_file(self.cookie, filename))
            except MagicException as e:
                return self._handle509Bug(e)

    def from_descriptor(self, fd):
        """
        Identify the contents of the open file descriptor `fd`
        """
        with self.lock:
            try:
                return maybe_decode(magic_descriptor(self.cookie, fd))
            except MagicException as e:
                return self._handle509Bug(e)

    def _handle509Bug(self, e):
        # libmagic 5.09 has a bug where it might fail to identify the
        # mimetype of a file and returns null from magic_file (and
        # likely _buffer), but also does not return an error message.
        if e.message is None and (self.flags & MAGIC_MIME_TYPE):
            return "application/octet-stream"
        else:
            raise e

    def setparam(self, param, val):
        """
        Set the libmagic parameter `param` (a MAGIC_PARAM_* value) to `val`
        """
        return magic_setparam(self.cookie, param, val)

    def getparam(self, param):
        """
        Return the current value of the libmagic parameter `param`
        """
        return magic_getparam(self.cookie, param)

    def __del__(self):
        # no _thread_check here because there can be no other
        # references to this object at this point.

        # during shutdown magic_close may have been cleared already so
        # make sure it exists before using it.

        # the self.cookie check should be unnecessary and was an
        # incorrect fix for a threading problem, however I'm leaving
        # it in because it's harmless and I'm slightly afraid to
        # remove it.
        if hasattr(self, 'cookie') and self.cookie and magic_close:
            magic_close(self.cookie)
            self.cookie = None


# Cache of shared Magic objects used by the module-level helpers below,
# keyed by the `mime` argument.
_instances = {}


def _get_magic_type(mime):
    """
    Return the shared Magic instance for `mime`, creating it on first use.
    """
    i = _instances.get(mime)
    if i is None:
        i = _instances[mime] = Magic(mime=mime)
    return i


def from_file(filename, mime=False):
    """
    Accepts a filename and returns the detected filetype.  Return
    value is the mimetype if mime=True, otherwise a human readable
    name.

    >>> magic.from_file("testdata/test.pdf", mime=True)
    'application/pdf'
    """
    m = _get_magic_type(mime)
    return m.from_file(filename)


def from_buffer(buffer, mime=False):
    """
    Accepts a binary string and returns the detected filetype.  Return
    value is the mimetype if mime=True, otherwise a human readable
    name.

    >>> magic.from_buffer(open("testdata/test.pdf", "rb").read(1024))
    'PDF document, version 1.2'
    """
    m = _get_magic_type(mime)
    return m.from_buffer(buffer)


def from_descriptor(fd, mime=False):
    """
    Accepts a file descriptor and returns the detected filetype.  Return
    value is the mimetype if mime=True, otherwise a human readable
    name.

    >>> f = open("testdata/test.pdf", "rb")
    >>> magic.from_descriptor(f.fileno())
    'PDF document, version 1.2'
    """
    m = _get_magic_type(mime)
    return m.from_descriptor(fd)

from . import loader
libmagic = loader.load_lib()

magic_t = ctypes.c_void_p


def errorcheck_null(result, func, args):
    """
    ctypes errcheck hook for functions that return NULL on failure.

    args[0] is the magic cookie the call was made with; its pending error
    text becomes the MagicException message.
    """
    if result is None:
        err = magic_error(args[0])
        raise MagicException(err)
    else:
        return result


def errorcheck_negative_one(result, func, args):
    """
    ctypes errcheck hook for functions that return -1 on failure.

    args[0] is the magic cookie the call was made with; its pending error
    text becomes the MagicException message.
    """
    if result == -1:
        err = magic_error(args[0])
        raise MagicException(err)
    else:
        return result


# return str on python3.  Don't want to unconditionally
# decode because that results in unicode on python2
def maybe_decode(s):
    # NEXTBREAK: remove
    if str == bytes:
        return s
    else:
        # backslashreplace here because sometimes libmagic will return metadata in the charset
        # of the file, which is unknown to us (e.g the title of a Word doc)
        return s.decode('utf-8', 'backslashreplace')


try:
    from os import PathLike

    def unpath(filename):
        """
        Return the underlying path for os.PathLike objects, anything else unchanged.
        """
        if isinstance(filename, PathLike):
            return filename.__fspath__()
        else:
            return filename
except ImportError:
    def unpath(filename):
        """
        os.PathLike is unavailable on this interpreter: return `filename` unchanged.
        """
        return filename


def coerce_filename(filename):
    """
    Convert `filename` into a value suitable for a c_char_p argument.

    None is passed through, path-like objects are unwrapped, and text is
    encoded as UTF-8 (with surrogateescape); other values are returned as is.
    """
    if filename is None:
        return None

    filename = unpath(filename)

    # ctypes will implicitly convert unicode strings to bytes with
    # .encode('ascii').  If you use the filesystem encoding
    # then you'll get inconsistent behavior (crashes) depending on the user's
    # LANG environment variable
    # NEXTBREAK: remove
    is_unicode = (sys.version_info[0] <= 2 and
                  isinstance(filename, unicode)) or \
                 (sys.version_info[0] >= 3 and
                  isinstance(filename, str))  # noqa: F821 (unicode is py2 only)
    if is_unicode:
        return filename.encode('utf-8', 'surrogateescape')
    else:
        return filename


magic_open = libmagic.magic_open
magic_open.restype = magic_t
magic_open.argtypes = [c_int]

magic_close = libmagic.magic_close
magic_close.restype = None
magic_close.argtypes = [magic_t]

magic_error = libmagic.magic_error
magic_error.restype = c_char_p
magic_error.argtypes = [magic_t]

magic_errno = libmagic.magic_errno
magic_errno.restype = c_int
magic_errno.argtypes = [magic_t]

_magic_file = libmagic.magic_file
_magic_file.restype = c_char_p
_magic_file.argtypes = [magic_t, c_char_p]
_magic_file.errcheck = errorcheck_null


def magic_file(cookie, filename):
    """
    Call libmagic's magic_file() after coercing `filename` for ctypes.
    """
    return _magic_file(cookie, coerce_filename(filename))


_magic_buffer = libmagic.magic_buffer
_magic_buffer.restype = c_char_p
_magic_buffer.argtypes = [magic_t, c_void_p, c_size_t]
_magic_buffer.errcheck = errorcheck_null


def magic_buffer(cookie, buf):
    """
    Call libmagic's magic_buffer() on `buf`, passing len(buf) as its size.
    """
    return _magic_buffer(cookie, buf, len(buf))


# The ctypes function is configured once, under the private name only; the
# public magic_descriptor name is the Python wrapper defined just below.
_magic_descriptor = libmagic.magic_descriptor
_magic_descriptor.restype = c_char_p
_magic_descriptor.argtypes = [magic_t, c_int]
_magic_descriptor.errcheck = errorcheck_null


def magic_descriptor(cookie, fd):
    """
    Call libmagic's magic_descriptor() on the file descriptor `fd`.
    """
    return _magic_descriptor(cookie, fd)


_magic_load = libmagic.magic_load
_magic_load.restype = c_int
_magic_load.argtypes = [magic_t, c_char_p]
_magic_load.errcheck = errorcheck_negative_one


def magic_load(cookie, filename):
    """
    Call libmagic's magic_load(); a `filename` of None is passed as NULL.
    """
    return _magic_load(cookie, coerce_filename(filename))


magic_setflags = libmagic.magic_setflags
magic_setflags.restype = c_int
magic_setflags.argtypes = [magic_t, c_int]

magic_check = libmagic.magic_check
magic_check.restype = c_int
magic_check.argtypes = [magic_t, c_char_p]

magic_compile = libmagic.magic_compile
magic_compile.restype = c_int
magic_compile.argtypes = [magic_t, c_char_p]

# magic_setparam/magic_getparam are bound only if the loaded library
# exports both of them.
_has_param = False
if hasattr(libmagic, 'magic_setparam') and hasattr(libmagic, 'magic_getparam'):
    _has_param = True
    _magic_setparam = libmagic.magic_setparam
    _magic_setparam.restype = c_int
    _magic_setparam.argtypes = [magic_t, c_int, POINTER(c_size_t)]
    _magic_setparam.errcheck = errorcheck_negative_one

    _magic_getparam = libmagic.magic_getparam
    _magic_getparam.restype = c_int
    _magic_getparam.argtypes = [magic_t, c_int, POINTER(c_size_t)]
    _magic_getparam.errcheck = errorcheck_negative_one


def magic_setparam(cookie, param, val):
    """
    Set parameter `param` to the integer `val` on `cookie`.

    Raises NotImplementedError when the loaded libmagic lacks the
    parameter API.
    """
    if not _has_param:
        raise NotImplementedError("magic_setparam not implemented")
    v = c_size_t(val)
    return _magic_setparam(cookie, param, byref(v))


def magic_getparam(cookie, param):
    """
    Return the integer value of parameter `param` on `cookie`.

    Raises NotImplementedError when the loaded libmagic lacks the
    parameter API.
    """
    if not _has_param:
        raise NotImplementedError("magic_getparam not implemented")
    val = c_size_t()
    _magic_getparam(cookie, param, byref(val))
    return val.value


# magic_version is bound only if the loaded library exports it.
_has_version = False
if hasattr(libmagic, "magic_version"):
    _has_version = True
    magic_version = libmagic.magic_version
    magic_version.restype = c_int
    magic_version.argtypes = []


def version():
    """
    Return the integer reported by libmagic's magic_version().

    Raises NotImplementedError when the loaded libmagic does not export it.
    """
    if not _has_version:
        raise NotImplementedError("magic_version not implemented")
    return magic_version()


MAGIC_NONE = 0x000000  # No flags
MAGIC_DEBUG = 0x000001  # Turn on debugging
MAGIC_SYMLINK = 0x000002  # Follow symlinks
MAGIC_COMPRESS = 0x000004  # Check inside compressed files
MAGIC_DEVICES = 0x000008  # Look at the contents of devices
MAGIC_MIME_TYPE = 0x000010  # Return a mime string
MAGIC_MIME_ENCODING = 0x000400  # Return the MIME encoding
# TODO:  should be
# MAGIC_MIME = MAGIC_MIME_TYPE | MAGIC_MIME_ENCODING
MAGIC_MIME = 0x000010  # Return a mime string
MAGIC_EXTENSION = 0x1000000  # Return a /-separated list of extensions

MAGIC_CONTINUE = 0x000020  # Return all matches
MAGIC_CHECK = 0x000040  # Print warnings to stderr
MAGIC_PRESERVE_ATIME = 0x000080  # Restore access time on exit
MAGIC_RAW = 0x000100  # Don't translate unprintable chars
MAGIC_ERROR = 0x000200  # Handle ENOENT etc as real errors

MAGIC_NO_CHECK_COMPRESS = 0x001000  # Don't check for compressed files
MAGIC_NO_CHECK_TAR = 0x002000  # Don't check for tar files
MAGIC_NO_CHECK_SOFT = 0x004000  # Don't check magic entries
MAGIC_NO_CHECK_APPTYPE = 0x008000  # Don't check application type
MAGIC_NO_CHECK_ELF = 0x010000  # Don't check for elf details
MAGIC_NO_CHECK_ASCII = 0x020000  # Don't check for ascii files
MAGIC_NO_CHECK_TROFF = 0x040000  # Don't check ascii/troff
MAGIC_NO_CHECK_FORTRAN = 0x080000  # Don't check ascii/fortran
MAGIC_NO_CHECK_TOKENS = 0x100000  # Don't check ascii/tokens

MAGIC_PARAM_INDIR_MAX = 0  # Recursion limit for indirect magic
MAGIC_PARAM_NAME_MAX = 1  # Use count limit for name/use magic
MAGIC_PARAM_ELF_PHNUM_MAX = 2  # Max ELF program sections processed
MAGIC_PARAM_ELF_SHNUM_MAX = 3  # Max ELF sections processed
MAGIC_PARAM_ELF_NOTES_MAX = 4  # Max ELF notes processed
MAGIC_PARAM_REGEX_MAX = 5  # Length limit for regex searches
MAGIC_PARAM_BYTES_MAX = 6  # Max number of bytes to read from file


# This package name conflicts with the one provided by upstream
# libmagic.  This is a common source of confusion for users.  To
# resolve, we ship a copy of that module, and expose its functions
# wrapped in deprecation warnings.
def _add_compat(to_module):
    """
    Install the compat module's entry points and constants into `to_module`.

    `to_module` is a namespace dict (this module passes globals()).  The
    four compat functions are installed behind a wrapper that emits a
    PendingDeprecationWarning; upper-case constants are copied over, and an
    Exception is raised if one clashes with a value already present
    (MAGIC_MIME excepted).
    """
    import functools
    import re
    import warnings
    from magic import compat

    def deprecation_wrapper(fn):
        # wraps() keeps the compat function's name and docstring visible
        # on the wrapper instead of the anonymous "_".
        @functools.wraps(fn)
        def _(*args, **kwargs):
            warnings.warn(
                "Using compatibility mode with libmagic's python binding. "
                "See https://github.com/ahupp/python-magic/blob/master/COMPAT.md for details.",
                PendingDeprecationWarning)

            return fn(*args, **kwargs)

        return _

    fn = ['detect_from_filename',
          'detect_from_content',
          'detect_from_fobj',
          'open']
    for fname in fn:
        to_module[fname] = deprecation_wrapper(compat.__dict__[fname])

    # copy constants over, ensuring there's no conflicts
    is_const_re = re.compile("^[A-Z_]+$")
    allowed_inconsistent = {'MAGIC_MIME'}
    for name, value in compat.__dict__.items():
        if not is_const_re.match(name):
            continue
        if name not in to_module:
            to_module[name] = value
            continue
        if name in allowed_inconsistent:
            continue
        if to_module[name] != value:
            raise Exception("inconsistent value for " + name)


_add_compat(globals())
